diff mbox series

[V6,6/8] fs/xfs: Combine xfs_diflags_to_linux() and xfs_diflags_to_iflags()

Message ID 20200407182958.568475-7-ira.weiny@intel.com (mailing list archive)
State Superseded, archived
Headers show
Series Enable per-file/per-directory DAX operations V6 | expand

Commit Message

Ira Weiny April 7, 2020, 6:29 p.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

The functionality in xfs_diflags_to_linux() and xfs_diflags_to_iflags() are
nearly identical.  The only difference is that *_to_linux() is called after
inode setup and disallows changing the DAX flag.

Combining them can be done with a flag which indicates if this is the initial
setup to allow the DAX flag to be properly set only at init time.

So remove xfs_diflags_to_linux() and call the modified xfs_diflags_to_iflags()
directly.

While we are here simplify xfs_diflags_to_iflags() to take struct xfs_inode and
use xfs_ip2xflags() to ensure future diflags are included correctly.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes from V5:
	The functions are no longer identical so we can only combine
	them rather than deleting one completely.  This is reflected in
	the new init parameter.
---
 fs/xfs/xfs_inode.h |  1 +
 fs/xfs/xfs_ioctl.c | 33 +--------------------------------
 fs/xfs/xfs_iops.c  | 42 +++++++++++++++++++++++++++---------------
 3 files changed, 29 insertions(+), 47 deletions(-)

Comments

Dave Chinner April 8, 2020, 2:08 a.m. UTC | #1
On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> The functionality in xfs_diflags_to_linux() and xfs_diflags_to_iflags() are
> nearly identical.  The only difference is that *_to_linux() is called after
> inode setup and disallows changing the DAX flag.
> 
> Combining them can be done with a flag which indicates if this is the initial
> setup to allow the DAX flag to be properly set only at init time.
> 
> So remove xfs_diflags_to_linux() and call the modified xfs_diflags_to_iflags()
> directly.
> 
> While we are here simplify xfs_diflags_to_iflags() to take struct xfs_inode and
> use xfs_ip2xflags() to ensure future diflags are included correctly.
> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes from V5:
> 	The functions are no longer identical so we can only combine
> 	them rather than deleting one completely.  This is reflected in
> 	the new init parameter.
> ---
>  fs/xfs/xfs_inode.h |  1 +
>  fs/xfs/xfs_ioctl.c | 33 +--------------------------------
>  fs/xfs/xfs_iops.c  | 42 +++++++++++++++++++++++++++---------------
>  3 files changed, 29 insertions(+), 47 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 492e53992fa9..e76ed9ca17f7 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -466,6 +466,7 @@ int	xfs_break_layouts(struct inode *inode, uint *iolock,
>  /* from xfs_iops.c */
>  extern void xfs_setup_inode(struct xfs_inode *ip);
>  extern void xfs_setup_iops(struct xfs_inode *ip);
> +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
>  
>  /*
>   * When setting up a newly allocated inode, we need to call
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index d42de92cb283..c6cd92ef4a05 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1100,37 +1100,6 @@ xfs_flags2diflags2(
>  	return di_flags2;
>  }
>  
> -STATIC void
> -xfs_diflags_to_linux(
> -	struct xfs_inode	*ip)
> -{
> -	struct inode		*inode = VFS_I(ip);
> -	unsigned int		xflags = xfs_ip2xflags(ip);
> -
> -	if (xflags & FS_XFLAG_IMMUTABLE)
> -		inode->i_flags |= S_IMMUTABLE;
> -	else
> -		inode->i_flags &= ~S_IMMUTABLE;
> -	if (xflags & FS_XFLAG_APPEND)
> -		inode->i_flags |= S_APPEND;
> -	else
> -		inode->i_flags &= ~S_APPEND;
> -	if (xflags & FS_XFLAG_SYNC)
> -		inode->i_flags |= S_SYNC;
> -	else
> -		inode->i_flags &= ~S_SYNC;
> -	if (xflags & FS_XFLAG_NOATIME)
> -		inode->i_flags |= S_NOATIME;
> -	else
> -		inode->i_flags &= ~S_NOATIME;
> -#if 0	/* disabled until the flag switching races are sorted out */
> -	if (xflags & FS_XFLAG_DAX)
> -		inode->i_flags |= S_DAX;
> -	else
> -		inode->i_flags &= ~S_DAX;
> -#endif

So this variant will set the flag in the inode if the disk inode
flag is set, otherwise it will clear it.  It does it with if/else
branches.


> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index e07f7b641226..a4ac8568c8c7 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax(
>  	return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
>  }
>  
> -STATIC bool
> +static bool
>  xfs_inode_enable_dax(
>  	struct xfs_inode *ip)
>  {

This belongs in the previous patch.

> @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax(
>  	return false;
>  }
>  
> -STATIC void
> +void
>  xfs_diflags_to_iflags(
> -	struct inode		*inode,
> -	struct xfs_inode	*ip)
> +	struct xfs_inode	*ip,
> +	bool init)
>  {
> -	uint16_t		flags = ip->i_d.di_flags;
> -
> -	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
> -			    S_NOATIME | S_DAX);

And this code cleared all the flags in the inode first, then
set them if the disk inode flag is set. This does not require
branches, resulting in more readable code and better code
generation.

> +	struct inode		*inode = VFS_I(ip);
> +	uint			diflags = xfs_ip2xflags(ip);
>  
> -	if (flags & XFS_DIFLAG_IMMUTABLE)
> +	if (diflags & FS_XFLAG_IMMUTABLE)
>  		inode->i_flags |= S_IMMUTABLE;
> -	if (flags & XFS_DIFLAG_APPEND)
> +	else
> +		inode->i_flags &= ~S_IMMUTABLE;

> +	if (diflags & FS_XFLAG_APPEND)
>  		inode->i_flags |= S_APPEND;
> -	if (flags & XFS_DIFLAG_SYNC)
> +	else
> +		inode->i_flags &= ~S_APPEND;
> +	if (diflags & FS_XFLAG_SYNC)
>  		inode->i_flags |= S_SYNC;
> -	if (flags & XFS_DIFLAG_NOATIME)
> +	else
> +		inode->i_flags &= ~S_SYNC;
> +	if (diflags & FS_XFLAG_NOATIME)
>  		inode->i_flags |= S_NOATIME;
> -	if (xfs_inode_enable_dax(ip))
> -		inode->i_flags |= S_DAX;
> +	else
> +		inode->i_flags &= ~S_NOATIME;
> +
> +	/* Only toggle the dax flag when initializing */
> +	if (init) {
> +		if (xfs_inode_enable_dax(ip))
> +			inode->i_flags |= S_DAX;
> +		else
> +			inode->i_flags &= ~S_DAX;
> +	}
>  }

IOWs, this:

        struct inode            *inode = VFS_I(ip);
        unsigned int            xflags = xfs_ip2xflags(ip);
        unsigned int            flags = 0;

        if (xflags & FS_XFLAG_IMMUTABLE)
                flags |= S_IMMUTABLE;
        if (xflags & FS_XFLAG_APPEND)
                flags |= S_APPEND;
        if (xflags & FS_XFLAG_SYNC)
                flags |= S_SYNC;
        if (xflags & FS_XFLAG_NOATIME)
                flags |= S_NOATIME;
	if ((xflags & FS_XFLAG_DAX) && init)
		flags |= S_DAX;

        inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
        inode->i_flags |= flags;

ends up being much easier to read and results in better code
generation. And we don't need to clear the S_DAX flag when "init" is
set, because we are starting from an inode that has no flags set
(because init!)...

Cheers,

Dave.
Ira Weiny April 8, 2020, 5:09 p.m. UTC | #2
On Wed, Apr 08, 2020 at 12:08:27PM +1000, Dave Chinner wrote:
> On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>

[snip]

> >  
> > -STATIC void
> > -xfs_diflags_to_linux(
> > -	struct xfs_inode	*ip)
> > -{
> > -	struct inode		*inode = VFS_I(ip);
> > -	unsigned int		xflags = xfs_ip2xflags(ip);
> > -
> > -	if (xflags & FS_XFLAG_IMMUTABLE)
> > -		inode->i_flags |= S_IMMUTABLE;
> > -	else
> > -		inode->i_flags &= ~S_IMMUTABLE;
> > -	if (xflags & FS_XFLAG_APPEND)
> > -		inode->i_flags |= S_APPEND;
> > -	else
> > -		inode->i_flags &= ~S_APPEND;
> > -	if (xflags & FS_XFLAG_SYNC)
> > -		inode->i_flags |= S_SYNC;
> > -	else
> > -		inode->i_flags &= ~S_SYNC;
> > -	if (xflags & FS_XFLAG_NOATIME)
> > -		inode->i_flags |= S_NOATIME;
> > -	else
> > -		inode->i_flags &= ~S_NOATIME;
> > -#if 0	/* disabled until the flag switching races are sorted out */
> > -	if (xflags & FS_XFLAG_DAX)
> > -		inode->i_flags |= S_DAX;
> > -	else
> > -		inode->i_flags &= ~S_DAX;
> > -#endif
> 
> So this variant will set the flag in the inode if the disk inode
> flag is set, otherwise it will clear it.  It does it with if/else
> branches.
> 
> 
> > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > index e07f7b641226..a4ac8568c8c7 100644
> > --- a/fs/xfs/xfs_iops.c
> > +++ b/fs/xfs/xfs_iops.c
> > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax(
> >  	return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
> >  }
> >  
> > -STATIC bool
> > +static bool
> >  xfs_inode_enable_dax(
> >  	struct xfs_inode *ip)
> >  {
> 
> This belongs in the previous patch.

Ah yea...  Sorry.

Fixed in V7

> 
> > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax(
> >  	return false;
> >  }
> >  
> > -STATIC void
> > +void
> >  xfs_diflags_to_iflags(
> > -	struct inode		*inode,
> > -	struct xfs_inode	*ip)
> > +	struct xfs_inode	*ip,
> > +	bool init)
> >  {
> > -	uint16_t		flags = ip->i_d.di_flags;
> > -
> > -	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
> > -			    S_NOATIME | S_DAX);
> 
> And this code cleared all the flags in the inode first, then
> set them if the disk inode flag is set. This does not require
> branches, resulting in more readable code and better code
> generation.
> 
> > +	struct inode		*inode = VFS_I(ip);
> > +	uint			diflags = xfs_ip2xflags(ip);
> >  
> > -	if (flags & XFS_DIFLAG_IMMUTABLE)
> > +	if (diflags & FS_XFLAG_IMMUTABLE)
> >  		inode->i_flags |= S_IMMUTABLE;
> > -	if (flags & XFS_DIFLAG_APPEND)
> > +	else
> > +		inode->i_flags &= ~S_IMMUTABLE;
> 
> > +	if (diflags & FS_XFLAG_APPEND)
> >  		inode->i_flags |= S_APPEND;
> > -	if (flags & XFS_DIFLAG_SYNC)
> > +	else
> > +		inode->i_flags &= ~S_APPEND;
> > +	if (diflags & FS_XFLAG_SYNC)
> >  		inode->i_flags |= S_SYNC;
> > -	if (flags & XFS_DIFLAG_NOATIME)
> > +	else
> > +		inode->i_flags &= ~S_SYNC;
> > +	if (diflags & FS_XFLAG_NOATIME)
> >  		inode->i_flags |= S_NOATIME;
> > -	if (xfs_inode_enable_dax(ip))
> > -		inode->i_flags |= S_DAX;
> > +	else
> > +		inode->i_flags &= ~S_NOATIME;
> > +
> > +	/* Only toggle the dax flag when initializing */
> > +	if (init) {
> > +		if (xfs_inode_enable_dax(ip))
> > +			inode->i_flags |= S_DAX;
> > +		else
> > +			inode->i_flags &= ~S_DAX;
> > +	}
> >  }
> 
> IOWs, this:
> 
>         struct inode            *inode = VFS_I(ip);
>         unsigned int            xflags = xfs_ip2xflags(ip);
>         unsigned int            flags = 0;
> 
>         if (xflags & FS_XFLAG_IMMUTABLE)
>                 flags |= S_IMMUTABLE;
>         if (xflags & FS_XFLAG_APPEND)
>                 flags |= S_APPEND;
>         if (xflags & FS_XFLAG_SYNC)
>                 flags |= S_SYNC;
>         if (xflags & FS_XFLAG_NOATIME)
>                 flags |= S_NOATIME;
> 	if ((xflags & FS_XFLAG_DAX) && init)
> 		flags |= S_DAX;
> 
>         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
>         inode->i_flags |= flags;
> 
> ends up being much easier to read and results in better code
> generation. And we don't need to clear the S_DAX flag when "init" is
> set, because we are starting from an inode that has no flags set
> (because init!)...

This sounds good but I think we need a slight modification to make the function equivalent in functionality.

void
xfs_diflags_to_iflags(
        struct xfs_inode        *ip,
        bool init)
{
        struct inode            *inode = VFS_I(ip);
        unsigned int            xflags = xfs_ip2xflags(ip);
        unsigned int            flags = 0;

        inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
                            S_DAX);

        if (xflags & FS_XFLAG_IMMUTABLE)
                flags |= S_IMMUTABLE;
        if (xflags & FS_XFLAG_APPEND)
                flags |= S_APPEND;
        if (xflags & FS_XFLAG_SYNC)
                flags |= S_SYNC;
        if (xflags & FS_XFLAG_NOATIME)
                flags |= S_NOATIME;
        if (init && xfs_inode_enable_dax(ip))
                flags |= S_DAX;

        inode->i_flags |= flags;
}

I've queued this for v7.

Thanks,
Ira

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner April 8, 2020, 9:02 p.m. UTC | #3
On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:
> On Wed, Apr 08, 2020 at 12:08:27PM +1000, Dave Chinner wrote:
> > On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote:
> > > From: Ira Weiny <ira.weiny@intel.com>
> 
> [snip]
> 
> > >  
> > > -STATIC void
> > > -xfs_diflags_to_linux(
> > > -	struct xfs_inode	*ip)
> > > -{
> > > -	struct inode		*inode = VFS_I(ip);
> > > -	unsigned int		xflags = xfs_ip2xflags(ip);
> > > -
> > > -	if (xflags & FS_XFLAG_IMMUTABLE)
> > > -		inode->i_flags |= S_IMMUTABLE;
> > > -	else
> > > -		inode->i_flags &= ~S_IMMUTABLE;
> > > -	if (xflags & FS_XFLAG_APPEND)
> > > -		inode->i_flags |= S_APPEND;
> > > -	else
> > > -		inode->i_flags &= ~S_APPEND;
> > > -	if (xflags & FS_XFLAG_SYNC)
> > > -		inode->i_flags |= S_SYNC;
> > > -	else
> > > -		inode->i_flags &= ~S_SYNC;
> > > -	if (xflags & FS_XFLAG_NOATIME)
> > > -		inode->i_flags |= S_NOATIME;
> > > -	else
> > > -		inode->i_flags &= ~S_NOATIME;
> > > -#if 0	/* disabled until the flag switching races are sorted out */
> > > -	if (xflags & FS_XFLAG_DAX)
> > > -		inode->i_flags |= S_DAX;
> > > -	else
> > > -		inode->i_flags &= ~S_DAX;
> > > -#endif
> > 
> > So this variant will set the flag in the inode if the disk inode
> > flag is set, otherwise it will clear it.  It does it with if/else
> > branches.
> > 
> > 
> > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > > index e07f7b641226..a4ac8568c8c7 100644
> > > --- a/fs/xfs/xfs_iops.c
> > > +++ b/fs/xfs/xfs_iops.c
> > > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax(
> > >  	return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
> > >  }
> > >  
> > > -STATIC bool
> > > +static bool
> > >  xfs_inode_enable_dax(
> > >  	struct xfs_inode *ip)
> > >  {
> > 
> > This belongs in the previous patch.
> 
> Ah yea...  Sorry.
> 
> Fixed in V7
> 
> > 
> > > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax(
> > >  	return false;
> > >  }
> > >  
> > > -STATIC void
> > > +void
> > >  xfs_diflags_to_iflags(
> > > -	struct inode		*inode,
> > > -	struct xfs_inode	*ip)
> > > +	struct xfs_inode	*ip,
> > > +	bool init)
> > >  {
> > > -	uint16_t		flags = ip->i_d.di_flags;
> > > -
> > > -	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
> > > -			    S_NOATIME | S_DAX);
> > 
> > And this code cleared all the flags in the inode first, then
> > set them if the disk inode flag is set. This does not require
> > branches, resulting in more readable code and better code
> > generation.
> > 
> > > +	struct inode		*inode = VFS_I(ip);
> > > +	uint			diflags = xfs_ip2xflags(ip);
> > >  
> > > -	if (flags & XFS_DIFLAG_IMMUTABLE)
> > > +	if (diflags & FS_XFLAG_IMMUTABLE)
> > >  		inode->i_flags |= S_IMMUTABLE;
> > > -	if (flags & XFS_DIFLAG_APPEND)
> > > +	else
> > > +		inode->i_flags &= ~S_IMMUTABLE;
> > 
> > > +	if (diflags & FS_XFLAG_APPEND)
> > >  		inode->i_flags |= S_APPEND;
> > > -	if (flags & XFS_DIFLAG_SYNC)
> > > +	else
> > > +		inode->i_flags &= ~S_APPEND;
> > > +	if (diflags & FS_XFLAG_SYNC)
> > >  		inode->i_flags |= S_SYNC;
> > > -	if (flags & XFS_DIFLAG_NOATIME)
> > > +	else
> > > +		inode->i_flags &= ~S_SYNC;
> > > +	if (diflags & FS_XFLAG_NOATIME)
> > >  		inode->i_flags |= S_NOATIME;
> > > -	if (xfs_inode_enable_dax(ip))
> > > -		inode->i_flags |= S_DAX;
> > > +	else
> > > +		inode->i_flags &= ~S_NOATIME;
> > > +
> > > +	/* Only toggle the dax flag when initializing */
> > > +	if (init) {
> > > +		if (xfs_inode_enable_dax(ip))
> > > +			inode->i_flags |= S_DAX;
> > > +		else
> > > +			inode->i_flags &= ~S_DAX;
> > > +	}
> > >  }
> > 
> > IOWs, this:
> > 
> >         struct inode            *inode = VFS_I(ip);
> >         unsigned int            xflags = xfs_ip2xflags(ip);
> >         unsigned int            flags = 0;
> > 
> >         if (xflags & FS_XFLAG_IMMUTABLE)
> >                 flags |= S_IMMUTABLE;
> >         if (xflags & FS_XFLAG_APPEND)
> >                 flags |= S_APPEND;
> >         if (xflags & FS_XFLAG_SYNC)
> >                 flags |= S_SYNC;
> >         if (xflags & FS_XFLAG_NOATIME)
> >                 flags |= S_NOATIME;
> > 	if ((xflags & FS_XFLAG_DAX) && init)
> > 		flags |= S_DAX;
> > 
> >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
> >         inode->i_flags |= flags;
> > 
> > ends up being much easier to read and results in better code
> > generation. And we don't need to clear the S_DAX flag when "init" is
> > set, because we are starting from an inode that has no flags set
> > (because init!)...
> 
> This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> 
> void
> xfs_diflags_to_iflags(
>         struct xfs_inode        *ip,
>         bool init)
> {
>         struct inode            *inode = VFS_I(ip);
>         unsigned int            xflags = xfs_ip2xflags(ip);
>         unsigned int            flags = 0;
> 
>         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
>                             S_DAX);

We don't want to clear the dax flag here, ever, if it is already
set. That is an externally visible change and opens us up (again) to
races where IS_DAX() changes half way through a fault path. IOWs, avoiding
clearing the DAX flag was something I did explicitly in the above
code fragment.

And it makes the logic clearer by pre-calculating the new flags,
then clearing and setting the inode flags together, rather than
having the spearated at the top and bottom of the function.

THis leads to an obvious conclusion: if we never clear the in memory
S_DAX flag, we can actually clear the on-disk flag safely, so that
next time the inode cycles into memory it won't be using DAX. IOWs,
admins can stop the applications, clear the DAX flag and drop
caches. This should result in the inode being recycled and when the
app is restarted it will run without DAX. No ned for deleting files,
copying large data sets, etc just to turn off an inode flag.

Cheers,

Dave.
Dan Williams April 8, 2020, 9:28 p.m. UTC | #4
On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:
> > On Wed, Apr 08, 2020 at 12:08:27PM +1000, Dave Chinner wrote:
> > > On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote:
> > > > From: Ira Weiny <ira.weiny@intel.com>
> >
> > [snip]
> >
> > > >
> > > > -STATIC void
> > > > -xfs_diflags_to_linux(
> > > > - struct xfs_inode        *ip)
> > > > -{
> > > > - struct inode            *inode = VFS_I(ip);
> > > > - unsigned int            xflags = xfs_ip2xflags(ip);
> > > > -
> > > > - if (xflags & FS_XFLAG_IMMUTABLE)
> > > > -         inode->i_flags |= S_IMMUTABLE;
> > > > - else
> > > > -         inode->i_flags &= ~S_IMMUTABLE;
> > > > - if (xflags & FS_XFLAG_APPEND)
> > > > -         inode->i_flags |= S_APPEND;
> > > > - else
> > > > -         inode->i_flags &= ~S_APPEND;
> > > > - if (xflags & FS_XFLAG_SYNC)
> > > > -         inode->i_flags |= S_SYNC;
> > > > - else
> > > > -         inode->i_flags &= ~S_SYNC;
> > > > - if (xflags & FS_XFLAG_NOATIME)
> > > > -         inode->i_flags |= S_NOATIME;
> > > > - else
> > > > -         inode->i_flags &= ~S_NOATIME;
> > > > -#if 0    /* disabled until the flag switching races are sorted out */
> > > > - if (xflags & FS_XFLAG_DAX)
> > > > -         inode->i_flags |= S_DAX;
> > > > - else
> > > > -         inode->i_flags &= ~S_DAX;
> > > > -#endif
> > >
> > > So this variant will set the flag in the inode if the disk inode
> > > flag is set, otherwise it will clear it.  It does it with if/else
> > > branches.
> > >
> > >
> > > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > > > index e07f7b641226..a4ac8568c8c7 100644
> > > > --- a/fs/xfs/xfs_iops.c
> > > > +++ b/fs/xfs/xfs_iops.c
> > > > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax(
> > > >   return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
> > > >  }
> > > >
> > > > -STATIC bool
> > > > +static bool
> > > >  xfs_inode_enable_dax(
> > > >   struct xfs_inode *ip)
> > > >  {
> > >
> > > This belongs in the previous patch.
> >
> > Ah yea...  Sorry.
> >
> > Fixed in V7
> >
> > >
> > > > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax(
> > > >   return false;
> > > >  }
> > > >
> > > > -STATIC void
> > > > +void
> > > >  xfs_diflags_to_iflags(
> > > > - struct inode            *inode,
> > > > - struct xfs_inode        *ip)
> > > > + struct xfs_inode        *ip,
> > > > + bool init)
> > > >  {
> > > > - uint16_t                flags = ip->i_d.di_flags;
> > > > -
> > > > - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
> > > > -                     S_NOATIME | S_DAX);
> > >
> > > And this code cleared all the flags in the inode first, then
> > > set them if the disk inode flag is set. This does not require
> > > branches, resulting in more readable code and better code
> > > generation.
> > >
> > > > + struct inode            *inode = VFS_I(ip);
> > > > + uint                    diflags = xfs_ip2xflags(ip);
> > > >
> > > > - if (flags & XFS_DIFLAG_IMMUTABLE)
> > > > + if (diflags & FS_XFLAG_IMMUTABLE)
> > > >           inode->i_flags |= S_IMMUTABLE;
> > > > - if (flags & XFS_DIFLAG_APPEND)
> > > > + else
> > > > +         inode->i_flags &= ~S_IMMUTABLE;
> > >
> > > > + if (diflags & FS_XFLAG_APPEND)
> > > >           inode->i_flags |= S_APPEND;
> > > > - if (flags & XFS_DIFLAG_SYNC)
> > > > + else
> > > > +         inode->i_flags &= ~S_APPEND;
> > > > + if (diflags & FS_XFLAG_SYNC)
> > > >           inode->i_flags |= S_SYNC;
> > > > - if (flags & XFS_DIFLAG_NOATIME)
> > > > + else
> > > > +         inode->i_flags &= ~S_SYNC;
> > > > + if (diflags & FS_XFLAG_NOATIME)
> > > >           inode->i_flags |= S_NOATIME;
> > > > - if (xfs_inode_enable_dax(ip))
> > > > -         inode->i_flags |= S_DAX;
> > > > + else
> > > > +         inode->i_flags &= ~S_NOATIME;
> > > > +
> > > > + /* Only toggle the dax flag when initializing */
> > > > + if (init) {
> > > > +         if (xfs_inode_enable_dax(ip))
> > > > +                 inode->i_flags |= S_DAX;
> > > > +         else
> > > > +                 inode->i_flags &= ~S_DAX;
> > > > + }
> > > >  }
> > >
> > > IOWs, this:
> > >
> > >         struct inode            *inode = VFS_I(ip);
> > >         unsigned int            xflags = xfs_ip2xflags(ip);
> > >         unsigned int            flags = 0;
> > >
> > >         if (xflags & FS_XFLAG_IMMUTABLE)
> > >                 flags |= S_IMMUTABLE;
> > >         if (xflags & FS_XFLAG_APPEND)
> > >                 flags |= S_APPEND;
> > >         if (xflags & FS_XFLAG_SYNC)
> > >                 flags |= S_SYNC;
> > >         if (xflags & FS_XFLAG_NOATIME)
> > >                 flags |= S_NOATIME;
> > >     if ((xflags & FS_XFLAG_DAX) && init)
> > >             flags |= S_DAX;
> > >
> > >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
> > >         inode->i_flags |= flags;
> > >
> > > ends up being much easier to read and results in better code
> > > generation. And we don't need to clear the S_DAX flag when "init" is
> > > set, because we are starting from an inode that has no flags set
> > > (because init!)...
> >
> > This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> >
> > void
> > xfs_diflags_to_iflags(
> >         struct xfs_inode        *ip,
> >         bool init)
> > {
> >         struct inode            *inode = VFS_I(ip);
> >         unsigned int            xflags = xfs_ip2xflags(ip);
> >         unsigned int            flags = 0;
> >
> >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> >                             S_DAX);
>
> We don't want to clear the dax flag here, ever, if it is already
> set. That is an externally visible change and opens us up (again) to
> races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> clearing the DAX flag was something I did explicitly in the above
> code fragment.
>
> And it makes the logic clearer by pre-calculating the new flags,
> then clearing and setting the inode flags together, rather than
> having the spearated at the top and bottom of the function.
>
> THis leads to an obvious conclusion: if we never clear the in memory
> S_DAX flag, we can actually clear the on-disk flag safely, so that
> next time the inode cycles into memory it won't be using DAX. IOWs,
> admins can stop the applications, clear the DAX flag and drop
> caches. This should result in the inode being recycled and when the
> app is restarted it will run without DAX. No ned for deleting files,
> copying large data sets, etc just to turn off an inode flag.

Makes sense, but is that sufficient? I recall you saying there might
be a multitude of other reasons that the inode is not evicted, not the
least of which is races [1]. Does this need another flag, lets call it
"dax toggle" to track the "I requested the inode to clear the flag,
but on cache-flush + restart the inode never got evicted" case. S_DAX
almost plays this role, but it loses the ability to audit which files
are pending an inode eviction event. So the dax-toggle flag indicates
to the kernel to xor the toggle value with the inode flag on inode
instantiation and the dax inode flag is never directly manipulated by
the ioctl path.

[1]: http://lore.kernel.org/r/20191025003603.GE4614@dread.disaster.area
Ira Weiny April 8, 2020, 10:07 p.m. UTC | #5
On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote:
> On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:

[snip]

> > 
> > This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> > 
> > void
> > xfs_diflags_to_iflags(
> >         struct xfs_inode        *ip,
> >         bool init)
> > {
> >         struct inode            *inode = VFS_I(ip);
> >         unsigned int            xflags = xfs_ip2xflags(ip);
> >         unsigned int            flags = 0;
> > 
> >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> >                             S_DAX);
> 
> We don't want to clear the dax flag here, ever, if it is already
> set. That is an externally visible change and opens us up (again) to
> races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> clearing the DAX flag was something I did explicitly in the above
> code fragment.

<sigh> yes... you are correct.

But I don't like depending on the caller to clear the S_DAX flag if
xfs_inode_enable_dax() is false.  IMO this function should clear the flag in
that case for consistency...

This is part of the reason I used the if/else logic from xfs_diflags_to_linux()
originally.  It is very explicit.

> 
> And it makes the logic clearer by pre-calculating the new flags,
> then clearing and setting the inode flags together, rather than
> having the spearated at the top and bottom of the function.

But this will not clear the S_DAX flag even if init is true.  To me that is a
potential for confusion down the road.

> 
> THis leads to an obvious conclusion: if we never clear the in memory
> S_DAX flag, we can actually clear the on-disk flag safely, so that
> next time the inode cycles into memory it won't be using DAX. IOWs,
> admins can stop the applications, clear the DAX flag and drop
> caches. This should result in the inode being recycled and when the
> app is restarted it will run without DAX. No ned for deleting files,
> copying large data sets, etc just to turn off an inode flag.

We already discussed evicting the inode and it was determined to be too
confusing.[*]

Furthermore, if we did want an interface like that why not allow the on-disk
flag to be set as well as cleared?

IMO, this function should set all of the flags consistently including S_DAX.

Ira

[*] https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Ira Weiny April 8, 2020, 10:10 p.m. UTC | #6
On Wed, Apr 08, 2020 at 02:28:30PM -0700, Dan Williams wrote:
> On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote:
> >

[snip]

> > >
> > > void
> > > xfs_diflags_to_iflags(
> > >         struct xfs_inode        *ip,
> > >         bool init)
> > > {
> > >         struct inode            *inode = VFS_I(ip);
> > >         unsigned int            xflags = xfs_ip2xflags(ip);
> > >         unsigned int            flags = 0;
> > >
> > >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> > >                             S_DAX);
> >
> > We don't want to clear the dax flag here, ever, if it is already
> > set. That is an externally visible change and opens us up (again) to
> > races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> > clearing the DAX flag was something I did explicitly in the above
> > code fragment.
> >
> > And it makes the logic clearer by pre-calculating the new flags,
> > then clearing and setting the inode flags together, rather than
> > having the spearated at the top and bottom of the function.
> >
> > THis leads to an obvious conclusion: if we never clear the in memory
> > S_DAX flag, we can actually clear the on-disk flag safely, so that
> > next time the inode cycles into memory it won't be using DAX. IOWs,
> > admins can stop the applications, clear the DAX flag and drop
> > caches. This should result in the inode being recycled and when the
> > app is restarted it will run without DAX. No ned for deleting files,
> > copying large data sets, etc just to turn off an inode flag.
> 
> Makes sense, but is that sufficient? I recall you saying there might
> be a multitude of other reasons that the inode is not evicted, not the
> least of which is races [1]. Does this need another flag, lets call it
> "dax toggle" to track the "I requested the inode to clear the flag,
> but on cache-flush + restart the inode never got evicted" case. S_DAX
> almost plays this role, but it loses the ability to audit which files
> are pending an inode eviction event. So the dax-toggle flag indicates
> to the kernel to xor the toggle value with the inode flag on inode
> instantiation and the dax inode flag is never directly manipulated by
> the ioctl path.
> 
> [1]: http://lore.kernel.org/r/20191025003603.GE4614@dread.disaster.area

FWIW I think we should continue down this simplified interface and get this
done for 5.8.  If we can come up with a way for delayed mode change I'm all for
looking into that.  But there has been too much controversy/difficulty about
changing the bit on a file.

So let's table this idea until >= 5.9

Ira
Dave Chinner April 8, 2020, 11:21 p.m. UTC | #7
On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote:
> On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote:
> > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:
> 
> [snip]
> 
> > > 
> > > This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> > > 
> > > void
> > > xfs_diflags_to_iflags(
> > >         struct xfs_inode        *ip,
> > >         bool init)
> > > {
> > >         struct inode            *inode = VFS_I(ip);
> > >         unsigned int            xflags = xfs_ip2xflags(ip);
> > >         unsigned int            flags = 0;
> > > 
> > >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> > >                             S_DAX);
> > 
> > We don't want to clear the dax flag here, ever, if it is already
> > set. That is an externally visible change and opens us up (again) to
> > races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> > clearing the DAX flag was something I did explicitly in the above
> > code fragment.
> 
> <sigh> yes... you are correct.
> 
> But I don't like depending on the caller to clear the S_DAX flag if
> xfs_inode_enable_dax() is false.  IMO this function should clear the flag in
> that case for consistency...

No. We simply cannot do that here except in the init case when the
inode is not yet visible to userspace. In which case, we know -for
certain- that the S_DAX is not set, and hence we do not need to
clear it. Initial conditions matter!

If you want to make sure of this, add this:

	ASSERT(!(IS_DAX(inode) && init));

And now we'll catch inodes that incorrectly have S_DAX set at init
time.

> > memory S_DAX flag, we can actually clear the on-disk flag
> > safely, so that next time the inode cycles into memory it won't
> > be using DAX. IOWs, admins can stop the applications, clear the
> > DAX flag and drop caches. This should result in the inode being
> > recycled and when the app is restarted it will run without DAX.
> > No ned for deleting files, copying large data sets, etc just to
> > turn off an inode flag.
> 
> We already discussed evicting the inode and it was determined to
> be too confusing.[*]

That discussion did not even consider how admins are supposed to
clear the inode flag once it is set on disk. It was entirely
focussed around "we can't change in memory S_DAX state" and how the
tri-state mount option to "override" the on-disk flag could be done.

Nobody noticed that being unable to rmeove the on-disk flag means
the admin's only option to turn off dax for an application is to
turn it off for everything, filesystem wide, which requires:

	1. stopping the app.
	2. stopping every other app using the filesystem
	3. unmounting the filesystem
	4. changing to dax=never mount option
	5. mounting the filesystem
	6. restarting all apps.

It's a hard stop for everything using the filesystem, and it changes
the runtime environment for all applications, not just the one that
needs DAX turned off.  Not to mention that if it's the root
filesystem that is using DAX, then it's a full system reboot needed
to change the mount options.

IMO, this is a non-starter from a production point of view - testing
and qualification of all applications rather than just the affected
app is required to make this sort of change.  It simply does not
follow the "minimal change to fix the problem" rules for managing
issues in production environments.

So, pLease explain to me how this process:

	1. stop the app
	2. remove inode flags via xfs_io
	3. run drop_caches
	4. start the app

is worse than requiring admins to unmount the filesystem to turn off
DAX for an application.

> Furthermore, if we did want an interface like that why not allow
> the on-disk flag to be set as well as cleared?

Well, why not - it's why I implemented the flag in the first place!
The only problem we have here is how to safely change the in-memory
DAX state, and that largely has nothing to do with setting/clearing
the on-disk flag....

Cheers,

Dave.
Dave Chinner April 8, 2020, 11:58 p.m. UTC | #8
On Wed, Apr 08, 2020 at 02:28:30PM -0700, Dan Williams wrote:
> On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote:
> > THis leads to an obvious conclusion: if we never clear the in memory
> > S_DAX flag, we can actually clear the on-disk flag safely, so that
> > next time the inode cycles into memory it won't be using DAX. IOWs,
> > admins can stop the applications, clear the DAX flag and drop
> > caches. This should result in the inode being recycled and when the
> > app is restarted it will run without DAX. No ned for deleting files,
> > copying large data sets, etc just to turn off an inode flag.
> 
> Makes sense, but is that sufficient? I recall you saying there might
> be a multitude of other reasons that the inode is not evicted, not the
> least of which is races [1]. Does this need another flag, lets call it
> "dax toggle" to track the "I requested the inode to clear the flag,
> but on cache-flush + restart the inode never got evicted" case.

You mean something like XFS_IDONTCACHE?

i.e. the functionality already exists in XFS to selectively evict an
inode from cache when the last reference to it is dropped rather
than let it go to the LRUs and hang around in memory.

That flag can be set when changing the on disk DAX flag, and we can
tweak how it works so new cache hits don't clear it (as happens
now). Hence the only thing that can prevent eviction are active
references.

That means we'll still need to stop the application and drop_caches,
because we need to close all the files and purge the dentries that
hold references to the inode before it can be evicted.

Cheers,

Dave.
Ira Weiny April 9, 2020, 12:12 a.m. UTC | #9
On Thu, Apr 09, 2020 at 09:21:06AM +1000, Dave Chinner wrote:
> On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote:
> > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote:
> > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:
> > 
> > [snip]
> > 
> > > > 
> > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> > > > 
> > > > void
> > > > xfs_diflags_to_iflags(
> > > >         struct xfs_inode        *ip,
> > > >         bool init)
> > > > {
> > > >         struct inode            *inode = VFS_I(ip);
> > > >         unsigned int            xflags = xfs_ip2xflags(ip);
> > > >         unsigned int            flags = 0;
> > > > 
> > > >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> > > >                             S_DAX);
> > > 
> > > We don't want to clear the dax flag here, ever, if it is already
> > > set. That is an externally visible change and opens us up (again) to
> > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> > > clearing the DAX flag was something I did explicitly in the above
> > > code fragment.
> > 
> > <sigh> yes... you are correct.
> > 
> > But I don't like depending on the caller to clear the S_DAX flag if
> > xfs_inode_enable_dax() is false.  IMO this function should clear the flag in
> > that case for consistency...
> 
> No. We simply cannot do that here except in the init case when the
> inode is not yet visible to userspace. In which case, we know -for
> certain- that the S_DAX is not set, and hence we do not need to
> clear it. Initial conditions matter!
> 
> If you want to make sure of this, add this:
> 
> 	ASSERT(!(IS_DAX(inode) && init));
> 
> And now we'll catch inodes that incorrectly have S_DAX set at init
> time.

Ok, that will work.  Also documents that expected initial condition.

> 
> > > memory S_DAX flag, we can actually clear the on-disk flag
> > > safely, so that next time the inode cycles into memory it won't
> > > be using DAX. IOWs, admins can stop the applications, clear the
> > > DAX flag and drop caches. This should result in the inode being
> > > recycled and when the app is restarted it will run without DAX.
> > > No ned for deleting files, copying large data sets, etc just to
> > > turn off an inode flag.
> > 
> > We already discussed evicting the inode and it was determined to
> > be too confusing.[*]
> 
> That discussion did not even consider how admins are supposed to
> clear the inode flag once it is set on disk.

I think this is a bit unfair.  I think we were all considering it...  and I
still think this patch set is a step in the right direction.

> It was entirely
> focussed around "we can't change in memory S_DAX state"

Not true.  There were many ideas on how to change the FS_XFLAG_DAX with some
sort of delayed S_DAX state to avoid changing S_DAX on an in memory inode.

I made the comment:

	"... I want to clarify.  ...  we could have the flag change with an
	appropriate error which could let the user know the change has been
	delayed."

	-- https://lore.kernel.org/lkml/20200402205518.GF3952565@iweiny-DESK2.sc.intel.com/

Jan made multiple comments:

	"I generally like the proposal but I think the fact that toggling
	FS_XFLAG_DAX will not have immediate effect on S_DAX will cause quite
	some confusion and ultimately bug reports."

	-- https://lore.kernel.org/lkml/20200401102511.GC19466@quack2.suse.cz/


	"Just switch FS_XFLAG_DAX flag, S_DAX flag will magically switch when
	inode gets evicted and the inode gets reloaded from the disk again. Did
	I misunderstand anything?

	And my thinking was that this is surprising behavior for the user and
	so it will likely generate lots of bug reports along the lines of "DAX
	inode flag does not work!"."

	-- https://lore.kernel.org/lkml/20200403170338.GD29920@quack2.suse.cz/

Darrick also had similar ideas/comments.

Christoph did say:

	"A reasonably smart application can try to evict itself."

	-- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/

Which I was unclear about???

Christoph does this mean you would be ok with changing the FS_XFLAG_DAX on disk
and letting S_DAX change later?

> and how the
> tri-state mount option to "override" the on-disk flag could be done.
> 
> Nobody noticed that being unable to rmeove the on-disk flag means
> the admin's only option to turn off dax for an application is to
> turn it off for everything, filesystem wide, which requires:

No. This is not entirely true.  While I don't like the idea of having to copy
data (and I agree with your points) it is possible to do.

> 
> 	1. stopping the app.
> 	2. stopping every other app using the filesystem
> 	3. unmounting the filesystem
> 	4. changing to dax=never mount option

I don't understand why we need to unmount and mount with dax=never?

> 	5. mounting the filesystem
> 	6. restarting all apps.
> 
> It's a hard stop for everything using the filesystem, and it changes
> the runtime environment for all applications, not just the one that
> needs DAX turned off.  Not to mention that if it's the root
> filesystem that is using DAX, then it's a full system reboot needed
> to change the mount options.
> 
> IMO, this is a non-starter from a production point of view - testing
> and qualification of all applications rather than just the affected
> app is required to make this sort of change.  It simply does not
> follow the "minimal change to fix the problem" rules for managing
> issues in production environments.
> 
> So, pLease explain to me how this process:
> 
> 	1. stop the app
> 	2. remove inode flags via xfs_io
> 	3. run drop_caches
> 	4. start the app
> 
> is worse than requiring admins to unmount the filesystem to turn off
> DAX for an application.

Jan?  Christoph?

> 
> > Furthermore, if we did want an interface like that why not allow
> > the on-disk flag to be set as well as cleared?
> 
> Well, why not - it's why I implemented the flag in the first place!
> The only problem we have here is how to safely change the in-memory
> DAX state, and that largely has nothing to do with setting/clearing
> the on-disk flag....

With the above change to xfs_diflags_to_iflags() I think we are ok here.

Ira
Ira Weiny April 9, 2020, 12:22 a.m. UTC | #10
On Thu, Apr 09, 2020 at 09:58:36AM +1000, Dave Chinner wrote:
> On Wed, Apr 08, 2020 at 02:28:30PM -0700, Dan Williams wrote:
> > On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote:
> > > THis leads to an obvious conclusion: if we never clear the in memory
> > > S_DAX flag, we can actually clear the on-disk flag safely, so that
> > > next time the inode cycles into memory it won't be using DAX. IOWs,
> > > admins can stop the applications, clear the DAX flag and drop
> > > caches. This should result in the inode being recycled and when the
> > > app is restarted it will run without DAX. No ned for deleting files,
> > > copying large data sets, etc just to turn off an inode flag.
> > 
> > Makes sense, but is that sufficient? I recall you saying there might
> > be a multitude of other reasons that the inode is not evicted, not the
> > least of which is races [1]. Does this need another flag, lets call it
> > "dax toggle" to track the "I requested the inode to clear the flag,
> > but on cache-flush + restart the inode never got evicted" case.
> 
> You mean something like XFS_IDONTCACHE?
> 
> i.e. the functionality already exists in XFS to selectively evict an
> inode from cache when the last reference to it is dropped rather
> than let it go to the LRUs and hang around in memory.
> 
> That flag can be set when changing the on disk DAX flag, and we can
> tweak how it works so new cache hits don't clear it (as happens
> now). Hence the only thing that can prevent eviction are active
> references.
> 
> That means we'll still need to stop the application and drop_caches,
> because we need to close all the files and purge the dentries that
> hold references to the inode before it can be evicted.

That sounds like a great idea...

Jan?  Christoph?

I will reiterate though that any delayed S_DAX change be done as a follow on
series to the one proposed here.

Ira
Darrick J. Wong April 9, 2020, 12:30 a.m. UTC | #11
On Wed, Apr 08, 2020 at 05:12:06PM -0700, Ira Weiny wrote:
> On Thu, Apr 09, 2020 at 09:21:06AM +1000, Dave Chinner wrote:
> > On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote:
> > > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote:
> > > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:
> > > 
> > > [snip]
> > > 
> > > > > 
> > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> > > > > 
> > > > > void
> > > > > xfs_diflags_to_iflags(
> > > > >         struct xfs_inode        *ip,
> > > > >         bool init)
> > > > > {
> > > > >         struct inode            *inode = VFS_I(ip);
> > > > >         unsigned int            xflags = xfs_ip2xflags(ip);
> > > > >         unsigned int            flags = 0;
> > > > > 
> > > > >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> > > > >                             S_DAX);
> > > > 
> > > > We don't want to clear the dax flag here, ever, if it is already
> > > > set. That is an externally visible change and opens us up (again) to
> > > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> > > > clearing the DAX flag was something I did explicitly in the above
> > > > code fragment.
> > > 
> > > <sigh> yes... you are correct.
> > > 
> > > But I don't like depending on the caller to clear the S_DAX flag if
> > > xfs_inode_enable_dax() is false.  IMO this function should clear the flag in
> > > that case for consistency...
> > 
> > No. We simply cannot do that here except in the init case when the
> > inode is not yet visible to userspace. In which case, we know -for
> > certain- that the S_DAX is not set, and hence we do not need to
> > clear it. Initial conditions matter!
> > 
> > If you want to make sure of this, add this:
> > 
> > 	ASSERT(!(IS_DAX(inode) && init));
> > 
> > And now we'll catch inodes that incorrectly have S_DAX set at init
> > time.
> 
> Ok, that will work.  Also documents that expected initial condition.
> 
> > 
> > > > memory S_DAX flag, we can actually clear the on-disk flag
> > > > safely, so that next time the inode cycles into memory it won't
> > > > be using DAX. IOWs, admins can stop the applications, clear the
> > > > DAX flag and drop caches. This should result in the inode being
> > > > recycled and when the app is restarted it will run without DAX.
> > > > No ned for deleting files, copying large data sets, etc just to
> > > > turn off an inode flag.
> > > 
> > > We already discussed evicting the inode and it was determined to
> > > be too confusing.[*]
> > 
> > That discussion did not even consider how admins are supposed to
> > clear the inode flag once it is set on disk.
> 
> I think this is a bit unfair.  I think we were all considering it...  and I
> still think this patch set is a step in the right direction.
> 
> > It was entirely
> > focussed around "we can't change in memory S_DAX state"
> 
> Not true.  There were many ideas on how to change the FS_XFLAG_DAX with some
> sort of delayed S_DAX state to avoid changing S_DAX on an in memory inode.
> 
> I made the comment:
> 
> 	"... I want to clarify.  ...  we could have the flag change with an
> 	appropriate error which could let the user know the change has been
> 	delayed."
> 
> 	-- https://lore.kernel.org/lkml/20200402205518.GF3952565@iweiny-DESK2.sc.intel.com/
> 
> Jan made multiple comments:
> 
> 	"I generally like the proposal but I think the fact that toggling
> 	FS_XFLAG_DAX will not have immediate effect on S_DAX will cause quite
> 	some confusion and ultimately bug reports."
> 
> 	-- https://lore.kernel.org/lkml/20200401102511.GC19466@quack2.suse.cz/
> 
> 
> 	"Just switch FS_XFLAG_DAX flag, S_DAX flag will magically switch when
> 	inode gets evicted and the inode gets reloaded from the disk again. Did
> 	I misunderstand anything?
> 
> 	And my thinking was that this is surprising behavior for the user and
> 	so it will likely generate lots of bug reports along the lines of "DAX
> 	inode flag does not work!"."
> 
> 	-- https://lore.kernel.org/lkml/20200403170338.GD29920@quack2.suse.cz/
> 
> Darrick also had similar ideas/comments.
> 
> Christoph did say:
> 
> 	"A reasonably smart application can try to evict itself."
> 
> 	-- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/
> 
> Which I was unclear about???
> 
> Christoph does this mean you would be ok with changing the FS_XFLAG_DAX on disk
> and letting S_DAX change later?
> 
> > and how the
> > tri-state mount option to "override" the on-disk flag could be done.
> > 
> > Nobody noticed that being unable to rmeove the on-disk flag means
> > the admin's only option to turn off dax for an application is to
> > turn it off for everything, filesystem wide, which requires:
> 
> No. This is not entirely true.  While I don't like the idea of having to copy
> data (and I agree with your points) it is possible to do.

But now that I think about it, that's really going to be a PITA, and
probably more of a pain than if the two DAX flags are only loosely
coupled.

> > 
> > 	1. stopping the app.
> > 	2. stopping every other app using the filesystem
> > 	3. unmounting the filesystem
> > 	4. changing to dax=never mount option
> 
> I don't understand why we need to unmount and mount with dax=never?

I've realized that if you can /never/ clear FS_XFLAG_DAX from a file,
then the only way to force it off is dax=never (so the kernel ignores
it) or move the fs to a non-pmem storage (so the kernel doesn't even
try).

> > 	5. mounting the filesystem
> > 	6. restarting all apps.
> > 
> > It's a hard stop for everything using the filesystem, and it changes
> > the runtime environment for all applications, not just the one that
> > needs DAX turned off.  Not to mention that if it's the root
> > filesystem that is using DAX, then it's a full system reboot needed
> > to change the mount options.
> > 
> > IMO, this is a non-starter from a production point of view - testing
> > and qualification of all applications rather than just the affected
> > app is required to make this sort of change.  It simply does not
> > follow the "minimal change to fix the problem" rules for managing
> > issues in production environments.
> > 
> > So, pLease explain to me how this process:
> > 
> > 	1. stop the app
> > 	2. remove inode flags via xfs_io
> > 	3. run drop_caches
> > 	4. start the app
> > 
> > is worse than requiring admins to unmount the filesystem to turn off
> > DAX for an application.
> 
> Jan?  Christoph?

But you're right, this thing keeps swirling around and around and around
because we can't ever get to agreement on this.  Maybe I'll just become
XFS BOFH MAINTAINER and make a decision like this:

 1 Applications must call statx to discover the current S_DAX state.

 2 There exists an advisory file inode flag FS_XFLAG_DAX that is set based on
   the parent directory FS_XFLAG_DAX inode flag.  This advisory flag can be
   changed after file creation, but it does not immediately affect the S_DAX
   state.

   If FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at
   inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX.
   Unless overridden...

 3 There exists a dax= mount option.

   "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX"
   "-o dax=always" means "always set S_DAX (at least on pmem), ignore FS_XFLAG_DAX"
        "-o dax" by itself means "dax=always"
   "-o dax=iflag" means "follow FS_XFLAG_DAX" and is the default

 4 There exists an advisory directory inode flag FS_XFLAG_DAX that can be
   changed at any time.  The flag state is copied into any files or
   subdirectories when they are created within that directory.  If programs
   require file access runs in S_DAX mode, they must create those files
   inside a directory with FS_XFLAG_DAX set, or mount the fs with an
   appropriate dax mount option.

 5 Programs that require a specific file access mode (DAX or not DAX) must
   do one of the following:

   (a) create files in directories with the FS_XFLAG_DAX flag set as needed;

   (b) have the administrator set an override via mount option;

   (c) if they need to change a file's FS_XFLAG_DAX flag so that it does not
       match the S_DAX state (as reported by statx), they must cause the
       kernel to evict the inode from memory.  This can be done by:

       i>   closing the file;
       ii>  re-opening the file and using statx to see if the fs has
            changed the S_DAX flag;
       iii> if not, either unmount and remount the filesystem, or
            closing the file and using drop_caches.

 6 I no longer think it's too wild to require that users who want to
   squeeze every last bit of performance out of the particular rough and
   tumble bits of their storage also be exposed to the difficulties of
   what happens when the operating system can't totally virtualize those
   hardware capabilities.  Your high performance sports car is not a
   Toyota minivan, as it were.

I think (like Dave said) that if you set XFS_IDONTCACHE on the inode
when you change the DAX flag, the VFS will kill the inode the instant
the last user close()s the file.  Then 5.c.ii will actually work.

--D

> > 
> > > Furthermore, if we did want an interface like that why not allow
> > > the on-disk flag to be set as well as cleared?
> > 
> > Well, why not - it's why I implemented the flag in the first place!
> > The only problem we have here is how to safely change the in-memory
> > DAX state, and that largely has nothing to do with setting/clearing
> > the on-disk flag....
> 
> With the above change to xfs_diflags_to_iflags() I think we are ok here.
> 
> Ira
>
Dave Chinner April 9, 2020, 12:49 a.m. UTC | #12
On Wed, Apr 08, 2020 at 05:12:06PM -0700, Ira Weiny wrote:
> On Thu, Apr 09, 2020 at 09:21:06AM +1000, Dave Chinner wrote:
> > On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote:
> > > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote:
> > > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote:
> > > 
> > > [snip]
> > > 
> > > > > 
> > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality.
> > > > > 
> > > > > void
> > > > > xfs_diflags_to_iflags(
> > > > >         struct xfs_inode        *ip,
> > > > >         bool init)
> > > > > {
> > > > >         struct inode            *inode = VFS_I(ip);
> > > > >         unsigned int            xflags = xfs_ip2xflags(ip);
> > > > >         unsigned int            flags = 0;
> > > > > 
> > > > >         inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME |
> > > > >                             S_DAX);
> > > > 
> > > > We don't want to clear the dax flag here, ever, if it is already
> > > > set. That is an externally visible change and opens us up (again) to
> > > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding
> > > > clearing the DAX flag was something I did explicitly in the above
> > > > code fragment.
> > > 
> > > <sigh> yes... you are correct.
> > > 
> > > But I don't like depending on the caller to clear the S_DAX flag if
> > > xfs_inode_enable_dax() is false.  IMO this function should clear the flag in
> > > that case for consistency...
> > 
> > No. We simply cannot do that here except in the init case when the
> > inode is not yet visible to userspace. In which case, we know -for
> > certain- that the S_DAX is not set, and hence we do not need to
> > clear it. Initial conditions matter!
> > 
> > If you want to make sure of this, add this:
> > 
> > 	ASSERT(!(IS_DAX(inode) && init));
> > 
> > And now we'll catch inodes that incorrectly have S_DAX set at init
> > time.
> 
> Ok, that will work.  Also documents that expected initial condition.
> 
> > 
> > > > memory S_DAX flag, we can actually clear the on-disk flag
> > > > safely, so that next time the inode cycles into memory it won't
> > > > be using DAX. IOWs, admins can stop the applications, clear the
> > > > DAX flag and drop caches. This should result in the inode being
> > > > recycled and when the app is restarted it will run without DAX.
> > > > No ned for deleting files, copying large data sets, etc just to
> > > > turn off an inode flag.
> > > 
> > > We already discussed evicting the inode and it was determined to
> > > be too confusing.[*]
> > 
> > That discussion did not even consider how admins are supposed to
> > clear the inode flag once it is set on disk.
> 
> I think this is a bit unfair.  I think we were all considering it...  and I
> still think this patch set is a step in the right direction.
> 
> > It was entirely
> > focussed around "we can't change in memory S_DAX state"
> 
> Not true.  There were many ideas on how to change the FS_XFLAG_DAX with some
> sort of delayed S_DAX state to avoid changing S_DAX on an in memory inode.
> 
> I made the comment:
> 
> 	"... I want to clarify.  ...  we could have the flag change with an
> 	appropriate error which could let the user know the change has been
> 	delayed."
> 
> 	-- https://lore.kernel.org/lkml/20200402205518.GF3952565@iweiny-DESK2.sc.intel.com/
> 
> Jan made multiple comments:
> 
> 	"I generally like the proposal but I think the fact that toggling
> 	FS_XFLAG_DAX will not have immediate effect on S_DAX will cause quite
> 	some confusion and ultimately bug reports."
> 
> 	-- https://lore.kernel.org/lkml/20200401102511.GC19466@quack2.suse.cz/
> 
> 
> 	"Just switch FS_XFLAG_DAX flag, S_DAX flag will magically switch when
> 	inode gets evicted and the inode gets reloaded from the disk again. Did
> 	I misunderstand anything?
> 
> 	And my thinking was that this is surprising behavior for the user and
> 	so it will likely generate lots of bug reports along the lines of "DAX
> 	inode flag does not work!"."
> 
> 	-- https://lore.kernel.org/lkml/20200403170338.GD29920@quack2.suse.cz/
> 
> Darrick also had similar ideas/comments.

None of these consider how the admin is supposed to remove
the flag once it is set. They all talk about issues that result
from setting/clearing the flag inside the kernel, and don't consider
the administration impacts of having an unclearable flag on disk
that the kernel uses for operational policy decisions.

Nobody said "hey, wait, if we don't allow the flag to be cleared
once the flag is set, how do we actually remove it so the admin can
stop overriding them globally with the mount option? If the kernel
can't clear the flag, then what mechanism are we going to provide to
let them clear it without interrupting production?"

> Christoph did say:
> 
> 	"A reasonably smart application can try to evict itself."
> 
> 	-- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/

I'd love to know how an unprivileged application can force the
eviction of an inode from cache. If the application cannot evict
files it is using reliably, then it's no better solution than
drop_caches. And given that userspace has to take references to
files to access them, by definition a file that userspace is trying
to evict will have active references and hence cannot be evicted.

However, if userspace can reliably evict inodes that it can access,
then we have a timing attack vector that we need to address.  i.e.
by now everyone here should know that user controlled cache eviction
is the basic requirement for creating most OS and CPU level timing
attacks....

> > and how the
> > tri-state mount option to "override" the on-disk flag could be done.
> > 
> > Nobody noticed that being unable to rmeove the on-disk flag means
> > the admin's only option to turn off dax for an application is to
> > turn it off for everything, filesystem wide, which requires:
> 
> No. This is not entirely true.  While I don't like the idea of having to copy
> data (and I agree with your points) it is possible to do.
> 
> > 
> > 	1. stopping the app.
> > 	2. stopping every other app using the filesystem
> > 	3. unmounting the filesystem
> > 	4. changing to dax=never mount option
> 
> I don't understand why we need to unmount and mount with dax=never?

1. IIUC your patches correctly, that's exactly how you implemented
the dax=... mount option.

2. If you don't unmount the filesystem and only require a remount,
then changing the mount option has all the same "delayed effect"
issues that changing the on-disk flag has. i.e. We can't change the
in-memory inode state until the inode has been evicted from cache
and a remount doesn't guarantee that cache eviction will succeed.

Cheers,

Dave.
Christoph Hellwig April 9, 2020, 12:40 p.m. UTC | #13
On Thu, Apr 09, 2020 at 10:49:21AM +1000, Dave Chinner wrote:
> > Christoph did say:
> > 
> > 	"A reasonably smart application can try to evict itself."
> > 
> > 	-- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/
> 
> I'd love to know how an unprivileged application can force the
> eviction of an inode from cache.

Where did the "unprivileged" suddenly come from?
Christoph Hellwig April 9, 2020, 12:41 p.m. UTC | #14
On Wed, Apr 08, 2020 at 05:22:03PM -0700, Ira Weiny wrote:
> > You mean something like XFS_IDONTCACHE?
> > 
> > i.e. the functionality already exists in XFS to selectively evict an
> > inode from cache when the last reference to it is dropped rather
> > than let it go to the LRUs and hang around in memory.
> > 
> > That flag can be set when changing the on disk DAX flag, and we can
> > tweak how it works so new cache hits don't clear it (as happens
> > now). Hence the only thing that can prevent eviction are active
> > references.
> > 
> > That means we'll still need to stop the application and drop_caches,
> > because we need to close all the files and purge the dentries that
> > hold references to the inode before it can be evicted.
> 
> That sounds like a great idea...
> 
> Jan?  Christoph?

Sounds ok.  Note that we could also pretty trivially lift
XFS_IDONTCACHE to the VFS if we need to apply the same scheme to
ext4.
Ira Weiny April 9, 2020, 3:29 p.m. UTC | #15
On Wed, Apr 08, 2020 at 05:30:21PM -0700, Darrick J. Wong wrote:

[snip]

> 
> But you're right, this thing keeps swirling around and around and around
> because we can't ever get to agreement on this.  Maybe I'll just become
> XFS BOFH MAINTAINER and make a decision like this:
> 
>  1 Applications must call statx to discover the current S_DAX state.
> 
>  2 There exists an advisory file inode flag FS_XFLAG_DAX that is set based on
>    the parent directory FS_XFLAG_DAX inode flag.  This advisory flag can be
>    changed after file creation, but it does not immediately affect the S_DAX
>    state.
> 
>    If FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at
>    inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX.
>    Unless overridden...
> 
>  3 There exists a dax= mount option.
> 
>    "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX"
>    "-o dax=always" means "always set S_DAX (at least on pmem), ignore FS_XFLAG_DAX"
>         "-o dax" by itself means "dax=always"
>    "-o dax=iflag" means "follow FS_XFLAG_DAX" and is the default

per-Dave '-o dax=inode'

> 
>  4 There exists an advisory directory inode flag FS_XFLAG_DAX that can be
>    changed at any time.  The flag state is copied into any files or
>    subdirectories when they are created within that directory.

Good.

>    If programs
>    require file access runs in S_DAX mode, they must create those files
>    inside a directory with FS_XFLAG_DAX set, or mount the fs with an
>    appropriate dax mount option.

Why do we need this to be true?  If the FS_XFLAG_DAX flag can be cleared why
not set it and allow the S_DAX change to occur later just like clearing it?
The logic is exactly the same.

> 
>  5 Programs that require a specific file access mode (DAX or not DAX) must

s/must/can/

>    do one of the following:
> 
>    (a) create files in directories with the FS_XFLAG_DAX flag set as needed;

Again if we allow clearing the flag why not setting?  So this is 1 option they
'can' do.

> 
>    (b) have the administrator set an override via mount option;
> 
>    (c) if they need to change a file's FS_XFLAG_DAX flag so that it does not
>        match the S_DAX state (as reported by statx), they must cause the
>        kernel to evict the inode from memory.  This can be done by:
> 
>        i>   closing the file;
>        ii>  re-opening the file and using statx to see if the fs has
>             changed the S_DAX flag;

i and ii need to be 1 step the user must follow.

>        iii> if not, either unmount and remount the filesystem, or
>             closing the file and using drop_caches.
> 
>  6 I no longer think it's too wild to require that users who want to
>    squeeze every last bit of performance out of the particular rough and
>    tumble bits of their storage also be exposed to the difficulties of
>    what happens when the operating system can't totally virtualize those
>    hardware capabilities.  Your high performance sports car is not a
>    Toyota minivan, as it were.

I'm good with this statement.  But I think we need to clean up the verbiage for
the documentation...  ;-)

Thanks for the summary.  I like these to get everyone on the same page.  :-D
Ira

> 
> I think (like Dave said) that if you set XFS_IDONTCACHE on the inode
> when you change the DAX flag, the VFS will kill the inode the instant
> the last user close()s the file.  Then 5.c.ii will actually work.
> 
> --D
> 
> > > 
> > > > Furthermore, if we did want an interface like that why not allow
> > > > the on-disk flag to be set as well as cleared?
> > > 
> > > Well, why not - it's why I implemented the flag in the first place!
> > > The only problem we have here is how to safely change the in-memory
> > > DAX state, and that largely has nothing to do with setting/clearing
> > > the on-disk flag....
> > 
> > With the above change to xfs_diflags_to_iflags() I think we are ok here.
> > 
> > Ira
> >
Darrick J. Wong April 9, 2020, 4:59 p.m. UTC | #16
On Thu, Apr 09, 2020 at 08:29:44AM -0700, Ira Weiny wrote:
> On Wed, Apr 08, 2020 at 05:30:21PM -0700, Darrick J. Wong wrote:
> 
> [snip]
> 
> > 
> > But you're right, this thing keeps swirling around and around and around
> > because we can't ever get to agreement on this.  Maybe I'll just become
> > XFS BOFH MAINTAINER and make a decision like this:
> > 
> >  1 Applications must call statx to discover the current S_DAX state.
> > 
> >  2 There exists an advisory file inode flag FS_XFLAG_DAX that is set based on
> >    the parent directory FS_XFLAG_DAX inode flag.  This advisory flag can be
> >    changed after file creation, but it does not immediately affect the S_DAX
> >    state.
> > 
> >    If FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at
> >    inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX.
> >    Unless overridden...
> > 
> >  3 There exists a dax= mount option.
> > 
> >    "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX"
> >    "-o dax=always" means "always set S_DAX (at least on pmem), ignore FS_XFLAG_DAX"
> >         "-o dax" by itself means "dax=always"
> >    "-o dax=iflag" means "follow FS_XFLAG_DAX" and is the default
> 
> per-Dave '-o dax=inode'

Ok.

> > 
> >  4 There exists an advisory directory inode flag FS_XFLAG_DAX that can be
> >    changed at any time.  The flag state is copied into any files or
> >    subdirectories when they are created within that directory.
> 
> Good.
> 
> >    If programs
> >    require file access runs in S_DAX mode, they must create those files
> >    inside a directory with FS_XFLAG_DAX set, or mount the fs with an
> >    appropriate dax mount option.
> 
> Why do we need this to be true?  If the FS_XFLAG_DAX flag can be cleared why
> not set it and allow the S_DAX change to occur later just like clearing it?
> The logic is exactly the same.

I think I'll just delete this sentence since I started pushing all that
verbiage towards (5).

To answer your question, yes, FS_XFLAG_DAX can be set on a file at any
time, the same as it can be cleared at any time.  Sorry that was
unclear, I'll fix that for the next draft (below).

> > 
> >  5 Programs that require a specific file access mode (DAX or not DAX) must
> 
> s/must/can/

Ok.

> >    do one of the following:
> > 
> >    (a) create files in directories with the FS_XFLAG_DAX flag set as needed;
> 
> Again if we allow clearing the flag why not setting?  So this is 1 option they
> 'can' do.
> 
> > 
> >    (b) have the administrator set an override via mount option;
> > 
> >    (c) if they need to change a file's FS_XFLAG_DAX flag so that it does not
> >        match the S_DAX state (as reported by statx), they must cause the
> >        kernel to evict the inode from memory.  This can be done by:
> > 
> >        i>   closing the file;
> >        ii>  re-opening the file and using statx to see if the fs has
> >             changed the S_DAX flag;
> 
> i and ii need to be 1 step the user must follow.

Ok, I'll combine these.

> >        iii> if not, either unmount and remount the filesystem, or
> >             closing the file and using drop_caches.
> > 
> >  6 I no longer think it's too wild to require that users who want to
> >    squeeze every last bit of performance out of the particular rough and
> >    tumble bits of their storage also be exposed to the difficulties of
> >    what happens when the operating system can't totally virtualize those
> >    hardware capabilities.  Your high performance sports car is not a
> >    Toyota minivan, as it were.
> 
> I'm good with this statement.  But I think we need to clean up the verbiage for
> the documentation...  ;-)

Heh. :)

> Thanks for the summary.  I like these to get everyone on the same page.  :-D

And today:

 1. There exists an in-kernel access mode flag S_DAX that is set when
    file accesses go directly to persistent memory, bypassing the page
    cache.  Applications must call statx to discover the current S_DAX
    state (STATX_ATTR_DAX).

 2. There exists an advisory file inode flag FS_XFLAG_DAX that is
    inherited from the parent directory FS_XFLAG_DAX inode flag at file
    creation time.  This advisory flag can be set or cleared at any
    time, but doing so does not immediately affect the S_DAX state.

    Unless overridden by mount options (see (3)), if FS_XFLAG_DAX is set
    and the fs is on pmem then it will enable S_DAX at inode load time;
    if FS_XFLAG_DAX is not set, it will not enable S_DAX.

 3. There exists a dax= mount option.

    "-o dax=never"  means "never set S_DAX, ignore FS_XFLAG_DAX."

    "-o dax=always" means "always set S_DAX (at least on pmem),
                    and ignore FS_XFLAG_DAX."

    "-o dax"        is an alias for "dax=always".

    "-o dax=inode"  means "follow FS_XFLAG_DAX" and is the default.

 4. There exists an advisory directory inode flag FS_XFLAG_DAX that can
    be set or cleared at any time.  The flag state is copied into any
    files or subdirectories when they are created within that directory.

 5. Programs that require a specific file access mode (DAX or not DAX)
    can do one of the following:

    (a) Create files in directories that the FS_XFLAG_DAX flag set as
        needed; or

    (b) Have the administrator set an override via mount option; or

    (c) Set or clear the file's FS_XFLAG_DAX flag as needed.  Programs
        must then cause the kernel to evict the inode from memory.  This
        can be done by:

        i>  Closing the file and re-opening the file and using statx to
            see if the fs has changed the S_DAX flag; and

        ii> If the file still does not have the desired S_DAX access
            mode, either unmount and remount the filesystem, or close
            the file and use drop_caches.

 6. It's not unreasonable that users who want to squeeze every last bit
    of performance out of the particular rough and tumble bits of their
    storage also be exposed to the difficulties of what happens when the
    operating system can't totally virtualize those hardware
    capabilities.  Your high performance sports car is not a Toyota
    minivan, as it were.

Given our overnight discussions, I don't think it'll be difficult to
hoist XFS_IDONTCACHE to the VFS so that 5.c.i is enough to change the
S_DAX state if nobody else is using the file.

--D

> > 
> > > Furthermore, if we did want an interface like that why not allow
> > > the on-disk flag to be set as well as cleared?
> > 
> > Well, why not - it's why I implemented the flag in the first place!
> > The only problem we have here is how to safely change the in-memory
> > DAX state, and that largely has nothing to do with setting/clearing
> > the on-disk flag....
> 
> With the above change to xfs_diflags_to_iflags() I think we are ok here.
> 
> Ira
> 
--D

> Ira
> 
> > 
> > I think (like Dave said) that if you set XFS_IDONTCACHE on the inode
> > when you change the DAX flag, the VFS will kill the inode the instant
> > the last user close()s the file.  Then 5.c.ii will actually work.
> > 
> > --D
> > 
> > > > 
> > > > > Furthermore, if we did want an interface like that why not allow
> > > > > the on-disk flag to be set as well as cleared?
> > > > 
> > > > Well, why not - it's why I implemented the flag in the first place!
> > > > The only problem we have here is how to safely change the in-memory
> > > > DAX state, and that largely has nothing to do with setting/clearing
> > > > the on-disk flag....
> > > 
> > > With the above change to xfs_diflags_to_iflags() I think we are ok here.
> > > 
> > > Ira
> > >
Jan Kara April 9, 2020, 5:17 p.m. UTC | #17
On Thu 09-04-20 09:59:27, Darrick J. Wong wrote:
> And today:
> 
>  1. There exists an in-kernel access mode flag S_DAX that is set when
>     file accesses go directly to persistent memory, bypassing the page
>     cache.  Applications must call statx to discover the current S_DAX
>     state (STATX_ATTR_DAX).
> 
>  2. There exists an advisory file inode flag FS_XFLAG_DAX that is
>     inherited from the parent directory FS_XFLAG_DAX inode flag at file
>     creation time.  This advisory flag can be set or cleared at any
>     time, but doing so does not immediately affect the S_DAX state.
> 
>     Unless overridden by mount options (see (3)), if FS_XFLAG_DAX is set
>     and the fs is on pmem then it will enable S_DAX at inode load time;
>     if FS_XFLAG_DAX is not set, it will not enable S_DAX.
> 
>  3. There exists a dax= mount option.
> 
>     "-o dax=never"  means "never set S_DAX, ignore FS_XFLAG_DAX."
> 
>     "-o dax=always" means "always set S_DAX (at least on pmem),
>                     and ignore FS_XFLAG_DAX."
> 
>     "-o dax"        is an alias for "dax=always".
> 
>     "-o dax=inode"  means "follow FS_XFLAG_DAX" and is the default.
> 
>  4. There exists an advisory directory inode flag FS_XFLAG_DAX that can
>     be set or cleared at any time.  The flag state is copied into any
>     files or subdirectories when they are created within that directory.
> 
>  5. Programs that require a specific file access mode (DAX or not DAX)
>     can do one of the following:
> 
>     (a) Create files in directories that the FS_XFLAG_DAX flag set as
>         needed; or
> 
>     (b) Have the administrator set an override via mount option; or
> 
>     (c) Set or clear the file's FS_XFLAG_DAX flag as needed.  Programs
>         must then cause the kernel to evict the inode from memory.  This
>         can be done by:
> 
>         i>  Closing the file and re-opening the file and using statx to
>             see if the fs has changed the S_DAX flag; and
> 
>         ii> If the file still does not have the desired S_DAX access
>             mode, either unmount and remount the filesystem, or close
>             the file and use drop_caches.
> 
>  6. It's not unreasonable that users who want to squeeze every last bit
>     of performance out of the particular rough and tumble bits of their
>     storage also be exposed to the difficulties of what happens when the
>     operating system can't totally virtualize those hardware
>     capabilities.  Your high performance sports car is not a Toyota
>     minivan, as it were.
> 
> Given our overnight discussions, I don't think it'll be difficult to
> hoist XFS_IDONTCACHE to the VFS so that 5.c.i is enough to change the
> S_DAX state if nobody else is using the file.

I still find the "S_DAX changes on inode eviction" confusing but I
guess it's as good as it gets and with XFS_IDONTCACHE lifted to VFS, 
it seems acceptable so OK from me.

								Honza
Ira Weiny April 9, 2020, 8:49 p.m. UTC | #18
On Thu, Apr 09, 2020 at 02:41:27PM +0200, Christoph Hellwig wrote:
> On Wed, Apr 08, 2020 at 05:22:03PM -0700, Ira Weiny wrote:
> > > You mean something like XFS_IDONTCACHE?
> > > 
> > > i.e. the functionality already exists in XFS to selectively evict an
> > > inode from cache when the last reference to it is dropped rather
> > > than let it go to the LRUs and hang around in memory.
> > > 
> > > That flag can be set when changing the on disk DAX flag, and we can
> > > tweak how it works so new cache hits don't clear it (as happens
> > > now). Hence the only thing that can prevent eviction are active
> > > references.
> > > 
> > > That means we'll still need to stop the application and drop_caches,
> > > because we need to close all the files and purge the dentries that
> > > hold references to the inode before it can be evicted.
> > 
> > That sounds like a great idea...
> > 
> > Jan?  Christoph?
> 
> Sounds ok.  Note that we could also pretty trivially lift
> XFS_IDONTCACHE to the VFS if we need to apply the same scheme to
> ext4.

Yes I have been slowing working on ext4 in the background.  So lifting
XXX_IDONTCACHE to VFS will be part of that series.

Thanks,
Ira
Ira Weiny April 9, 2020, 8:54 p.m. UTC | #19
On Thu, Apr 09, 2020 at 09:59:27AM -0700, Darrick J. Wong wrote:

[snip]
 
> And today:
> 
>  1. There exists an in-kernel access mode flag S_DAX that is set when
>     file accesses go directly to persistent memory, bypassing the page
>     cache.  Applications must call statx to discover the current S_DAX
>     state (STATX_ATTR_DAX).
> 
>  2. There exists an advisory file inode flag FS_XFLAG_DAX that is
>     inherited from the parent directory FS_XFLAG_DAX inode flag at file
>     creation time.  This advisory flag can be set or cleared at any
>     time, but doing so does not immediately affect the S_DAX state.
> 
>     Unless overridden by mount options (see (3)), if FS_XFLAG_DAX is set
>     and the fs is on pmem then it will enable S_DAX at inode load time;
>     if FS_XFLAG_DAX is not set, it will not enable S_DAX.
> 
>  3. There exists a dax= mount option.
> 
>     "-o dax=never"  means "never set S_DAX, ignore FS_XFLAG_DAX."
> 
>     "-o dax=always" means "always set S_DAX (at least on pmem),
>                     and ignore FS_XFLAG_DAX."
> 
>     "-o dax"        is an alias for "dax=always".
> 
>     "-o dax=inode"  means "follow FS_XFLAG_DAX" and is the default.
> 
>  4. There exists an advisory directory inode flag FS_XFLAG_DAX that can
>     be set or cleared at any time.  The flag state is copied into any
>     files or subdirectories when they are created within that directory.
> 
>  5. Programs that require a specific file access mode (DAX or not DAX)
>     can do one of the following:
> 
>     (a) Create files in directories that the FS_XFLAG_DAX flag set as
>         needed; or
> 
>     (b) Have the administrator set an override via mount option; or
> 
>     (c) Set or clear the file's FS_XFLAG_DAX flag as needed.  Programs
>         must then cause the kernel to evict the inode from memory.  This
>         can be done by:
> 
>         i>  Closing the file and re-opening the file and using statx to
>             see if the fs has changed the S_DAX flag; and
> 
>         ii> If the file still does not have the desired S_DAX access
>             mode, either unmount and remount the filesystem, or close
>             the file and use drop_caches.
> 
>  6. It's not unreasonable that users who want to squeeze every last bit
>     of performance out of the particular rough and tumble bits of their
>     storage also be exposed to the difficulties of what happens when the
>     operating system can't totally virtualize those hardware
>     capabilities.  Your high performance sports car is not a Toyota
>     minivan, as it were.
> 
> Given our overnight discussions, I don't think it'll be difficult to
> hoist XFS_IDONTCACHE to the VFS so that 5.c.i is enough to change the
> S_DAX state if nobody else is using the file.

Agreed!

One note on implementation, I plan to get XFS_IDONTCACHE tested with XFS and
leave hoisting it to VFS for the series which enables ext4 as that is when we
would need such hoisting.

Thanks everyone for another round of discussions!  :-D

Ira
Dave Chinner April 10, 2020, 12:27 a.m. UTC | #20
On Thu, Apr 09, 2020 at 02:40:31PM +0200, Christoph Hellwig wrote:
> On Thu, Apr 09, 2020 at 10:49:21AM +1000, Dave Chinner wrote:
> > > Christoph did say:
> > > 
> > > 	"A reasonably smart application can try to evict itself."
> > > 
> > > 	-- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/
> > 
> > I'd love to know how an unprivileged application can force the
> > eviction of an inode from cache.
> 
> Where did the "unprivileged" suddenly come from?

I'm assuming that applications are being run without the root
permissions needed to run drop_caches. i.e. the apps are
unprivileged, and therefore can't brute force inode cache eviction.
That's why I'm asking what mechanism these applications are using to
evict inodes on demand without requiring elevated privileges,
because I can't see how they'd acheive this...

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 492e53992fa9..e76ed9ca17f7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -466,6 +466,7 @@  int	xfs_break_layouts(struct inode *inode, uint *iolock,
 /* from xfs_iops.c */
 extern void xfs_setup_inode(struct xfs_inode *ip);
 extern void xfs_setup_iops(struct xfs_inode *ip);
+extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
 
 /*
  * When setting up a newly allocated inode, we need to call
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d42de92cb283..c6cd92ef4a05 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1100,37 +1100,6 @@  xfs_flags2diflags2(
 	return di_flags2;
 }
 
-STATIC void
-xfs_diflags_to_linux(
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-	unsigned int		xflags = xfs_ip2xflags(ip);
-
-	if (xflags & FS_XFLAG_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (xflags & FS_XFLAG_APPEND)
-		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (xflags & FS_XFLAG_SYNC)
-		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (xflags & FS_XFLAG_NOATIME)
-		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
-#if 0	/* disabled until the flag switching races are sorted out */
-	if (xflags & FS_XFLAG_DAX)
-		inode->i_flags |= S_DAX;
-	else
-		inode->i_flags &= ~S_DAX;
-#endif
-}
-
 static int
 xfs_ioctl_setattr_xflags(
 	struct xfs_trans	*tp,
@@ -1168,7 +1137,7 @@  xfs_ioctl_setattr_xflags(
 	ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
 	ip->i_d.di_flags2 = di_flags2;
 
-	xfs_diflags_to_linux(ip);
+	xfs_diflags_to_iflags(ip, false);
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	XFS_STATS_INC(mp, xs_ig_attrchg);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e07f7b641226..a4ac8568c8c7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1259,7 +1259,7 @@  xfs_inode_supports_dax(
 	return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
 }
 
-STATIC bool
+static bool
 xfs_inode_enable_dax(
 	struct xfs_inode *ip)
 {
@@ -1272,26 +1272,38 @@  xfs_inode_enable_dax(
 	return false;
 }
 
-STATIC void
+void
 xfs_diflags_to_iflags(
-	struct inode		*inode,
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	bool init)
 {
-	uint16_t		flags = ip->i_d.di_flags;
-
-	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
-			    S_NOATIME | S_DAX);
+	struct inode		*inode = VFS_I(ip);
+	uint			diflags = xfs_ip2xflags(ip);
 
-	if (flags & XFS_DIFLAG_IMMUTABLE)
+	if (diflags & FS_XFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
-	if (flags & XFS_DIFLAG_APPEND)
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+	if (diflags & FS_XFLAG_APPEND)
 		inode->i_flags |= S_APPEND;
-	if (flags & XFS_DIFLAG_SYNC)
+	else
+		inode->i_flags &= ~S_APPEND;
+	if (diflags & FS_XFLAG_SYNC)
 		inode->i_flags |= S_SYNC;
-	if (flags & XFS_DIFLAG_NOATIME)
+	else
+		inode->i_flags &= ~S_SYNC;
+	if (diflags & FS_XFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	if (xfs_inode_enable_dax(ip))
-		inode->i_flags |= S_DAX;
+	else
+		inode->i_flags &= ~S_NOATIME;
+
+	/* Only toggle the dax flag when initializing */
+	if (init) {
+		if (xfs_inode_enable_dax(ip))
+			inode->i_flags |= S_DAX;
+		else
+			inode->i_flags &= ~S_DAX;
+	}
 }
 
 /*
@@ -1320,7 +1332,7 @@  xfs_setup_inode(
 	inode->i_gid    = xfs_gid_to_kgid(ip->i_d.di_gid);
 
 	i_size_write(inode, ip->i_d.di_size);
-	xfs_diflags_to_iflags(inode, ip);
+	xfs_diflags_to_iflags(ip, true);
 
 	if (S_ISDIR(inode->i_mode)) {
 		/*