Message ID | 20200407182958.568475-7-ira.weiny@intel.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | Enable per-file/per-directory DAX operations V6 | expand |
On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote: > From: Ira Weiny <ira.weiny@intel.com> > > The functionality in xfs_diflags_to_linux() and xfs_diflags_to_iflags() are > nearly identical. The only difference is that *_to_linux() is called after > inode setup and disallows changing the DAX flag. > > Combining them can be done with a flag which indicates if this is the initial > setup to allow the DAX flag to be properly set only at init time. > > So remove xfs_diflags_to_linux() and call the modified xfs_diflags_to_iflags() > directly. > > While we are here simplify xfs_diflags_to_iflags() to take struct xfs_inode and > use xfs_ip2xflags() to ensure future diflags are included correctly. > > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > > --- > Changes from V5: > The functions are no longer identical so we can only combine > them rather than deleting one completely. This is reflected in > the new init parameter. > --- > fs/xfs/xfs_inode.h | 1 + > fs/xfs/xfs_ioctl.c | 33 +-------------------------------- > fs/xfs/xfs_iops.c | 42 +++++++++++++++++++++++++++--------------- > 3 files changed, 29 insertions(+), 47 deletions(-) > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 492e53992fa9..e76ed9ca17f7 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -466,6 +466,7 @@ int xfs_break_layouts(struct inode *inode, uint *iolock, > /* from xfs_iops.c */ > extern void xfs_setup_inode(struct xfs_inode *ip); > extern void xfs_setup_iops(struct xfs_inode *ip); > +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); > > /* > * When setting up a newly allocated inode, we need to call > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > index d42de92cb283..c6cd92ef4a05 100644 > --- a/fs/xfs/xfs_ioctl.c > +++ b/fs/xfs/xfs_ioctl.c > @@ -1100,37 +1100,6 @@ xfs_flags2diflags2( > return di_flags2; > } > > -STATIC void > -xfs_diflags_to_linux( > - struct xfs_inode *ip) > -{ > - struct inode *inode = VFS_I(ip); > - unsigned int xflags = xfs_ip2xflags(ip); > - > - if (xflags & FS_XFLAG_IMMUTABLE) > - inode->i_flags |= S_IMMUTABLE; > - else > - inode->i_flags &= ~S_IMMUTABLE; > - if (xflags & FS_XFLAG_APPEND) > - inode->i_flags |= S_APPEND; > - else > - inode->i_flags &= ~S_APPEND; > - if (xflags & FS_XFLAG_SYNC) > - inode->i_flags |= S_SYNC; > - else > - inode->i_flags &= ~S_SYNC; > - if (xflags & FS_XFLAG_NOATIME) > - inode->i_flags |= S_NOATIME; > - else > - inode->i_flags &= ~S_NOATIME; > -#if 0 /* disabled until the flag switching races are sorted out */ > - if (xflags & FS_XFLAG_DAX) > - inode->i_flags |= S_DAX; > - else > - inode->i_flags &= ~S_DAX; > -#endif So this variant will set the flag in the inode if the disk inode flag is set, otherwise it will clear it. It does it with if/else branches. > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > index e07f7b641226..a4ac8568c8c7 100644 > --- a/fs/xfs/xfs_iops.c > +++ b/fs/xfs/xfs_iops.c > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax( > return xfs_inode_buftarg(ip)->bt_daxdev != NULL; > } > > -STATIC bool > +static bool > xfs_inode_enable_dax( > struct xfs_inode *ip) > { This belongs in the previous patch. > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax( > return false; > } > > -STATIC void > +void > xfs_diflags_to_iflags( > - struct inode *inode, > - struct xfs_inode *ip) > + struct xfs_inode *ip, > + bool init) > { > - uint16_t flags = ip->i_d.di_flags; > - > - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | > - S_NOATIME | S_DAX); And this code cleared all the flags in the inode first, then set them if the disk inode flag is set. This does not require branches, resulting in more readable code and better code generation. > + struct inode *inode = VFS_I(ip); > + uint diflags = xfs_ip2xflags(ip); > > - if (flags & XFS_DIFLAG_IMMUTABLE) > + if (diflags & FS_XFLAG_IMMUTABLE) > inode->i_flags |= S_IMMUTABLE; > - if (flags & XFS_DIFLAG_APPEND) > + else > + inode->i_flags &= ~S_IMMUTABLE; > + if (diflags & FS_XFLAG_APPEND) > inode->i_flags |= S_APPEND; > - if (flags & XFS_DIFLAG_SYNC) > + else > + inode->i_flags &= ~S_APPEND; > + if (diflags & FS_XFLAG_SYNC) > inode->i_flags |= S_SYNC; > - if (flags & XFS_DIFLAG_NOATIME) > + else > + inode->i_flags &= ~S_SYNC; > + if (diflags & FS_XFLAG_NOATIME) > inode->i_flags |= S_NOATIME; > - if (xfs_inode_enable_dax(ip)) > - inode->i_flags |= S_DAX; > + else > + inode->i_flags &= ~S_NOATIME; > + > + /* Only toggle the dax flag when initializing */ > + if (init) { > + if (xfs_inode_enable_dax(ip)) > + inode->i_flags |= S_DAX; > + else > + inode->i_flags &= ~S_DAX; > + } > } IOWs, this: struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); unsigned int flags = 0; if (xflags & FS_XFLAG_IMMUTABLE) flags |= S_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) flags |= S_APPEND; if (xflags & FS_XFLAG_SYNC) flags |= S_SYNC; if (xflags & FS_XFLAG_NOATIME) flags |= S_NOATIME; if ((xflags & FS_XFLAG_DAX) && init) flags |= S_DAX; inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); inode->i_flags |= flags; ends up being much easier to read and results in better code generation. And we don't need to clear the S_DAX flag when "init" is set, because we are starting from an inode that has no flags set (because init!)... Cheers, Dave.
On Wed, Apr 08, 2020 at 12:08:27PM +1000, Dave Chinner wrote: > On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote: > > From: Ira Weiny <ira.weiny@intel.com> [snip] > > > > -STATIC void > > -xfs_diflags_to_linux( > > - struct xfs_inode *ip) > > -{ > > - struct inode *inode = VFS_I(ip); > > - unsigned int xflags = xfs_ip2xflags(ip); > > - > > - if (xflags & FS_XFLAG_IMMUTABLE) > > - inode->i_flags |= S_IMMUTABLE; > > - else > > - inode->i_flags &= ~S_IMMUTABLE; > > - if (xflags & FS_XFLAG_APPEND) > > - inode->i_flags |= S_APPEND; > > - else > > - inode->i_flags &= ~S_APPEND; > > - if (xflags & FS_XFLAG_SYNC) > > - inode->i_flags |= S_SYNC; > > - else > > - inode->i_flags &= ~S_SYNC; > > - if (xflags & FS_XFLAG_NOATIME) > > - inode->i_flags |= S_NOATIME; > > - else > > - inode->i_flags &= ~S_NOATIME; > > -#if 0 /* disabled until the flag switching races are sorted out */ > > - if (xflags & FS_XFLAG_DAX) > > - inode->i_flags |= S_DAX; > > - else > > - inode->i_flags &= ~S_DAX; > > -#endif > > So this variant will set the flag in the inode if the disk inode > flag is set, otherwise it will clear it. It does it with if/else > branches. > > > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > > index e07f7b641226..a4ac8568c8c7 100644 > > --- a/fs/xfs/xfs_iops.c > > +++ b/fs/xfs/xfs_iops.c > > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax( > > return xfs_inode_buftarg(ip)->bt_daxdev != NULL; > > } > > > > -STATIC bool > > +static bool > > xfs_inode_enable_dax( > > struct xfs_inode *ip) > > { > > This belongs in the previous patch. Ah yea... Sorry. Fixed in V7 > > > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax( > > return false; > > } > > > > -STATIC void > > +void > > xfs_diflags_to_iflags( > > - struct inode *inode, > > - struct xfs_inode *ip) > > + struct xfs_inode *ip, > > + bool init) > > { > > - uint16_t flags = ip->i_d.di_flags; > > - > > - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | > > - S_NOATIME | S_DAX); > > And this code cleared all the flags in the inode first, then > set them if the disk inode flag is set. This does not require > branches, resulting in more readable code and better code > generation. > > > + struct inode *inode = VFS_I(ip); > > + uint diflags = xfs_ip2xflags(ip); > > > > - if (flags & XFS_DIFLAG_IMMUTABLE) > > + if (diflags & FS_XFLAG_IMMUTABLE) > > inode->i_flags |= S_IMMUTABLE; > > - if (flags & XFS_DIFLAG_APPEND) > > + else > > + inode->i_flags &= ~S_IMMUTABLE; > > > + if (diflags & FS_XFLAG_APPEND) > > inode->i_flags |= S_APPEND; > > - if (flags & XFS_DIFLAG_SYNC) > > + else > > + inode->i_flags &= ~S_APPEND; > > + if (diflags & FS_XFLAG_SYNC) > > inode->i_flags |= S_SYNC; > > - if (flags & XFS_DIFLAG_NOATIME) > > + else > > + inode->i_flags &= ~S_SYNC; > > + if (diflags & FS_XFLAG_NOATIME) > > inode->i_flags |= S_NOATIME; > > - if (xfs_inode_enable_dax(ip)) > > - inode->i_flags |= S_DAX; > > + else > > + inode->i_flags &= ~S_NOATIME; > > + > > + /* Only toggle the dax flag when initializing */ > > + if (init) { > > + if (xfs_inode_enable_dax(ip)) > > + inode->i_flags |= S_DAX; > > + else > > + inode->i_flags &= ~S_DAX; > > + } > > } > > IOWs, this: > > struct inode *inode = VFS_I(ip); > unsigned int xflags = xfs_ip2xflags(ip); > unsigned int flags = 0; > > if (xflags & FS_XFLAG_IMMUTABLE) > flags |= S_IMMUTABLE; > if (xflags & FS_XFLAG_APPEND) > flags |= S_APPEND; > if (xflags & FS_XFLAG_SYNC) > flags |= S_SYNC; > if (xflags & FS_XFLAG_NOATIME) > flags |= S_NOATIME; > if ((xflags & FS_XFLAG_DAX) && init) > flags |= S_DAX; > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); > inode->i_flags |= flags; > > ends up being much easier to read and results in better code > generation. And we don't need to clear the S_DAX flag when "init" is > set, because we are starting from an inode that has no flags set > (because init!)... This sounds good but I think we need a slight modification to make the function equivalent in functionality. void xfs_diflags_to_iflags( struct xfs_inode *ip, bool init) { struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); unsigned int flags = 0; inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | S_DAX); if (xflags & FS_XFLAG_IMMUTABLE) flags |= S_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) flags |= S_APPEND; if (xflags & FS_XFLAG_SYNC) flags |= S_SYNC; if (xflags & FS_XFLAG_NOATIME) flags |= S_NOATIME; if (init && xfs_inode_enable_dax(ip)) flags |= S_DAX; inode->i_flags |= flags; } I've queued this for v7. Thanks, Ira > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: > On Wed, Apr 08, 2020 at 12:08:27PM +1000, Dave Chinner wrote: > > On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote: > > > From: Ira Weiny <ira.weiny@intel.com> > > [snip] > > > > > > > -STATIC void > > > -xfs_diflags_to_linux( > > > - struct xfs_inode *ip) > > > -{ > > > - struct inode *inode = VFS_I(ip); > > > - unsigned int xflags = xfs_ip2xflags(ip); > > > - > > > - if (xflags & FS_XFLAG_IMMUTABLE) > > > - inode->i_flags |= S_IMMUTABLE; > > > - else > > > - inode->i_flags &= ~S_IMMUTABLE; > > > - if (xflags & FS_XFLAG_APPEND) > > > - inode->i_flags |= S_APPEND; > > > - else > > > - inode->i_flags &= ~S_APPEND; > > > - if (xflags & FS_XFLAG_SYNC) > > > - inode->i_flags |= S_SYNC; > > > - else > > > - inode->i_flags &= ~S_SYNC; > > > - if (xflags & FS_XFLAG_NOATIME) > > > - inode->i_flags |= S_NOATIME; > > > - else > > > - inode->i_flags &= ~S_NOATIME; > > > -#if 0 /* disabled until the flag switching races are sorted out */ > > > - if (xflags & FS_XFLAG_DAX) > > > - inode->i_flags |= S_DAX; > > > - else > > > - inode->i_flags &= ~S_DAX; > > > -#endif > > > > So this variant will set the flag in the inode if the disk inode > > flag is set, otherwise it will clear it. It does it with if/else > > branches. > > > > > > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > > > index e07f7b641226..a4ac8568c8c7 100644 > > > --- a/fs/xfs/xfs_iops.c > > > +++ b/fs/xfs/xfs_iops.c > > > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax( > > > return xfs_inode_buftarg(ip)->bt_daxdev != NULL; > > > } > > > > > > -STATIC bool > > > +static bool > > > xfs_inode_enable_dax( > > > struct xfs_inode *ip) > > > { > > > > This belongs in the previous patch. > > Ah yea... Sorry. > > Fixed in V7 > > > > > > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax( > > > return false; > > > } > > > > > > -STATIC void > > > +void > > > xfs_diflags_to_iflags( > > > - struct inode *inode, > > > - struct xfs_inode *ip) > > > + struct xfs_inode *ip, > > > + bool init) > > > { > > > - uint16_t flags = ip->i_d.di_flags; > > > - > > > - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | > > > - S_NOATIME | S_DAX); > > > > And this code cleared all the flags in the inode first, then > > set them if the disk inode flag is set. This does not require > > branches, resulting in more readable code and better code > > generation. > > > > > + struct inode *inode = VFS_I(ip); > > > + uint diflags = xfs_ip2xflags(ip); > > > > > > - if (flags & XFS_DIFLAG_IMMUTABLE) > > > + if (diflags & FS_XFLAG_IMMUTABLE) > > > inode->i_flags |= S_IMMUTABLE; > > > - if (flags & XFS_DIFLAG_APPEND) > > > + else > > > + inode->i_flags &= ~S_IMMUTABLE; > > > > > + if (diflags & FS_XFLAG_APPEND) > > > inode->i_flags |= S_APPEND; > > > - if (flags & XFS_DIFLAG_SYNC) > > > + else > > > + inode->i_flags &= ~S_APPEND; > > > + if (diflags & FS_XFLAG_SYNC) > > > inode->i_flags |= S_SYNC; > > > - if (flags & XFS_DIFLAG_NOATIME) > > > + else > > > + inode->i_flags &= ~S_SYNC; > > > + if (diflags & FS_XFLAG_NOATIME) > > > inode->i_flags |= S_NOATIME; > > > - if (xfs_inode_enable_dax(ip)) > > > - inode->i_flags |= S_DAX; > > > + else > > > + inode->i_flags &= ~S_NOATIME; > > > + > > > + /* Only toggle the dax flag when initializing */ > > > + if (init) { > > > + if (xfs_inode_enable_dax(ip)) > > > + inode->i_flags |= S_DAX; > > > + else > > > + inode->i_flags &= ~S_DAX; > > > + } > > > } > > > > IOWs, this: > > > > struct inode *inode = VFS_I(ip); > > unsigned int xflags = xfs_ip2xflags(ip); > > unsigned int flags = 0; > > > > if (xflags & FS_XFLAG_IMMUTABLE) > > flags |= S_IMMUTABLE; > > if (xflags & FS_XFLAG_APPEND) > > flags |= S_APPEND; > > if (xflags & FS_XFLAG_SYNC) > > flags |= S_SYNC; > > if (xflags & FS_XFLAG_NOATIME) > > flags |= S_NOATIME; > > if ((xflags & FS_XFLAG_DAX) && init) > > flags |= S_DAX; > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); > > inode->i_flags |= flags; > > > > ends up being much easier to read and results in better code > > generation. And we don't need to clear the S_DAX flag when "init" is > > set, because we are starting from an inode that has no flags set > > (because init!)... > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > void > xfs_diflags_to_iflags( > struct xfs_inode *ip, > bool init) > { > struct inode *inode = VFS_I(ip); > unsigned int xflags = xfs_ip2xflags(ip); > unsigned int flags = 0; > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > S_DAX); We don't want to clear the dax flag here, ever, if it is already set. That is an externally visible change and opens us up (again) to races where IS_DAX() changes half way through a fault path. IOWs, avoiding clearing the DAX flag was something I did explicitly in the above code fragment. And it makes the logic clearer by pre-calculating the new flags, then clearing and setting the inode flags together, rather than having the spearated at the top and bottom of the function. THis leads to an obvious conclusion: if we never clear the in memory S_DAX flag, we can actually clear the on-disk flag safely, so that next time the inode cycles into memory it won't be using DAX. IOWs, admins can stop the applications, clear the DAX flag and drop caches. This should result in the inode being recycled and when the app is restarted it will run without DAX. No ned for deleting files, copying large data sets, etc just to turn off an inode flag. Cheers, Dave.
On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote: > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: > > On Wed, Apr 08, 2020 at 12:08:27PM +1000, Dave Chinner wrote: > > > On Tue, Apr 07, 2020 at 11:29:56AM -0700, ira.weiny@intel.com wrote: > > > > From: Ira Weiny <ira.weiny@intel.com> > > > > [snip] > > > > > > > > > > -STATIC void > > > > -xfs_diflags_to_linux( > > > > - struct xfs_inode *ip) > > > > -{ > > > > - struct inode *inode = VFS_I(ip); > > > > - unsigned int xflags = xfs_ip2xflags(ip); > > > > - > > > > - if (xflags & FS_XFLAG_IMMUTABLE) > > > > - inode->i_flags |= S_IMMUTABLE; > > > > - else > > > > - inode->i_flags &= ~S_IMMUTABLE; > > > > - if (xflags & FS_XFLAG_APPEND) > > > > - inode->i_flags |= S_APPEND; > > > > - else > > > > - inode->i_flags &= ~S_APPEND; > > > > - if (xflags & FS_XFLAG_SYNC) > > > > - inode->i_flags |= S_SYNC; > > > > - else > > > > - inode->i_flags &= ~S_SYNC; > > > > - if (xflags & FS_XFLAG_NOATIME) > > > > - inode->i_flags |= S_NOATIME; > > > > - else > > > > - inode->i_flags &= ~S_NOATIME; > > > > -#if 0 /* disabled until the flag switching races are sorted out */ > > > > - if (xflags & FS_XFLAG_DAX) > > > > - inode->i_flags |= S_DAX; > > > > - else > > > > - inode->i_flags &= ~S_DAX; > > > > -#endif > > > > > > So this variant will set the flag in the inode if the disk inode > > > flag is set, otherwise it will clear it. It does it with if/else > > > branches. > > > > > > > > > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > > > > index e07f7b641226..a4ac8568c8c7 100644 > > > > --- a/fs/xfs/xfs_iops.c > > > > +++ b/fs/xfs/xfs_iops.c > > > > @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax( > > > > return xfs_inode_buftarg(ip)->bt_daxdev != NULL; > > > > } > > > > > > > > -STATIC bool > > > > +static bool > > > > xfs_inode_enable_dax( > > > > struct xfs_inode *ip) > > > > { > > > > > > This belongs in the previous patch. > > > > Ah yea... Sorry. > > > > Fixed in V7 > > > > > > > > > @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax( > > > > return false; > > > > } > > > > > > > > -STATIC void > > > > +void > > > > xfs_diflags_to_iflags( > > > > - struct inode *inode, > > > > - struct xfs_inode *ip) > > > > + struct xfs_inode *ip, > > > > + bool init) > > > > { > > > > - uint16_t flags = ip->i_d.di_flags; > > > > - > > > > - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | > > > > - S_NOATIME | S_DAX); > > > > > > And this code cleared all the flags in the inode first, then > > > set them if the disk inode flag is set. This does not require > > > branches, resulting in more readable code and better code > > > generation. > > > > > > > + struct inode *inode = VFS_I(ip); > > > > + uint diflags = xfs_ip2xflags(ip); > > > > > > > > - if (flags & XFS_DIFLAG_IMMUTABLE) > > > > + if (diflags & FS_XFLAG_IMMUTABLE) > > > > inode->i_flags |= S_IMMUTABLE; > > > > - if (flags & XFS_DIFLAG_APPEND) > > > > + else > > > > + inode->i_flags &= ~S_IMMUTABLE; > > > > > > > + if (diflags & FS_XFLAG_APPEND) > > > > inode->i_flags |= S_APPEND; > > > > - if (flags & XFS_DIFLAG_SYNC) > > > > + else > > > > + inode->i_flags &= ~S_APPEND; > > > > + if (diflags & FS_XFLAG_SYNC) > > > > inode->i_flags |= S_SYNC; > > > > - if (flags & XFS_DIFLAG_NOATIME) > > > > + else > > > > + inode->i_flags &= ~S_SYNC; > > > > + if (diflags & FS_XFLAG_NOATIME) > > > > inode->i_flags |= S_NOATIME; > > > > - if (xfs_inode_enable_dax(ip)) > > > > - inode->i_flags |= S_DAX; > > > > + else > > > > + inode->i_flags &= ~S_NOATIME; > > > > + > > > > + /* Only toggle the dax flag when initializing */ > > > > + if (init) { > > > > + if (xfs_inode_enable_dax(ip)) > > > > + inode->i_flags |= S_DAX; > > > > + else > > > > + inode->i_flags &= ~S_DAX; > > > > + } > > > > } > > > > > > IOWs, this: > > > > > > struct inode *inode = VFS_I(ip); > > > unsigned int xflags = xfs_ip2xflags(ip); > > > unsigned int flags = 0; > > > > > > if (xflags & FS_XFLAG_IMMUTABLE) > > > flags |= S_IMMUTABLE; > > > if (xflags & FS_XFLAG_APPEND) > > > flags |= S_APPEND; > > > if (xflags & FS_XFLAG_SYNC) > > > flags |= S_SYNC; > > > if (xflags & FS_XFLAG_NOATIME) > > > flags |= S_NOATIME; > > > if ((xflags & FS_XFLAG_DAX) && init) > > > flags |= S_DAX; > > > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); > > > inode->i_flags |= flags; > > > > > > ends up being much easier to read and results in better code > > > generation. And we don't need to clear the S_DAX flag when "init" is > > > set, because we are starting from an inode that has no flags set > > > (because init!)... > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > > > void > > xfs_diflags_to_iflags( > > struct xfs_inode *ip, > > bool init) > > { > > struct inode *inode = VFS_I(ip); > > unsigned int xflags = xfs_ip2xflags(ip); > > unsigned int flags = 0; > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > S_DAX); > > We don't want to clear the dax flag here, ever, if it is already > set. That is an externally visible change and opens us up (again) to > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > clearing the DAX flag was something I did explicitly in the above > code fragment. > > And it makes the logic clearer by pre-calculating the new flags, > then clearing and setting the inode flags together, rather than > having the spearated at the top and bottom of the function. > > THis leads to an obvious conclusion: if we never clear the in memory > S_DAX flag, we can actually clear the on-disk flag safely, so that > next time the inode cycles into memory it won't be using DAX. IOWs, > admins can stop the applications, clear the DAX flag and drop > caches. This should result in the inode being recycled and when the > app is restarted it will run without DAX. No ned for deleting files, > copying large data sets, etc just to turn off an inode flag. Makes sense, but is that sufficient? I recall you saying there might be a multitude of other reasons that the inode is not evicted, not the least of which is races [1]. Does this need another flag, lets call it "dax toggle" to track the "I requested the inode to clear the flag, but on cache-flush + restart the inode never got evicted" case. S_DAX almost plays this role, but it loses the ability to audit which files are pending an inode eviction event. So the dax-toggle flag indicates to the kernel to xor the toggle value with the inode flag on inode instantiation and the dax inode flag is never directly manipulated by the ioctl path. [1]: http://lore.kernel.org/r/20191025003603.GE4614@dread.disaster.area
On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote: > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: [snip] > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > > > void > > xfs_diflags_to_iflags( > > struct xfs_inode *ip, > > bool init) > > { > > struct inode *inode = VFS_I(ip); > > unsigned int xflags = xfs_ip2xflags(ip); > > unsigned int flags = 0; > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > S_DAX); > > We don't want to clear the dax flag here, ever, if it is already > set. That is an externally visible change and opens us up (again) to > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > clearing the DAX flag was something I did explicitly in the above > code fragment. <sigh> yes... you are correct. But I don't like depending on the caller to clear the S_DAX flag if xfs_inode_enable_dax() is false. IMO this function should clear the flag in that case for consistency... This is part of the reason I used the if/else logic from xfs_diflags_to_linux() originally. It is very explicit. > > And it makes the logic clearer by pre-calculating the new flags, > then clearing and setting the inode flags together, rather than > having the spearated at the top and bottom of the function. But this will not clear the S_DAX flag even if init is true. To me that is a potential for confusion down the road. > > THis leads to an obvious conclusion: if we never clear the in memory > S_DAX flag, we can actually clear the on-disk flag safely, so that > next time the inode cycles into memory it won't be using DAX. IOWs, > admins can stop the applications, clear the DAX flag and drop > caches. This should result in the inode being recycled and when the > app is restarted it will run without DAX. No ned for deleting files, > copying large data sets, etc just to turn off an inode flag. We already discussed evicting the inode and it was determined to be too confusing.[*] Furthermore, if we did want an interface like that why not allow the on-disk flag to be set as well as cleared? IMO, this function should set all of the flags consistently including S_DAX. Ira [*] https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/ > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
On Wed, Apr 08, 2020 at 02:28:30PM -0700, Dan Williams wrote: > On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote: > > [snip] > > > > > > void > > > xfs_diflags_to_iflags( > > > struct xfs_inode *ip, > > > bool init) > > > { > > > struct inode *inode = VFS_I(ip); > > > unsigned int xflags = xfs_ip2xflags(ip); > > > unsigned int flags = 0; > > > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > > S_DAX); > > > > We don't want to clear the dax flag here, ever, if it is already > > set. That is an externally visible change and opens us up (again) to > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > > clearing the DAX flag was something I did explicitly in the above > > code fragment. > > > > And it makes the logic clearer by pre-calculating the new flags, > > then clearing and setting the inode flags together, rather than > > having the spearated at the top and bottom of the function. > > > > THis leads to an obvious conclusion: if we never clear the in memory > > S_DAX flag, we can actually clear the on-disk flag safely, so that > > next time the inode cycles into memory it won't be using DAX. IOWs, > > admins can stop the applications, clear the DAX flag and drop > > caches. This should result in the inode being recycled and when the > > app is restarted it will run without DAX. No ned for deleting files, > > copying large data sets, etc just to turn off an inode flag. > > Makes sense, but is that sufficient? I recall you saying there might > be a multitude of other reasons that the inode is not evicted, not the > least of which is races [1]. Does this need another flag, lets call it > "dax toggle" to track the "I requested the inode to clear the flag, > but on cache-flush + restart the inode never got evicted" case. S_DAX > almost plays this role, but it loses the ability to audit which files > are pending an inode eviction event. So the dax-toggle flag indicates > to the kernel to xor the toggle value with the inode flag on inode > instantiation and the dax inode flag is never directly manipulated by > the ioctl path. > > [1]: http://lore.kernel.org/r/20191025003603.GE4614@dread.disaster.area FWIW I think we should continue down this simplified interface and get this done for 5.8. If we can come up with a way for delayed mode change I'm all for looking into that. But there has been too much controversy/difficulty about changing the bit on a file. So let's table this idea until >= 5.9 Ira
On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote: > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote: > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: > > [snip] > > > > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > > > > > void > > > xfs_diflags_to_iflags( > > > struct xfs_inode *ip, > > > bool init) > > > { > > > struct inode *inode = VFS_I(ip); > > > unsigned int xflags = xfs_ip2xflags(ip); > > > unsigned int flags = 0; > > > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > > S_DAX); > > > > We don't want to clear the dax flag here, ever, if it is already > > set. That is an externally visible change and opens us up (again) to > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > > clearing the DAX flag was something I did explicitly in the above > > code fragment. > > <sigh> yes... you are correct. > > But I don't like depending on the caller to clear the S_DAX flag if > xfs_inode_enable_dax() is false. IMO this function should clear the flag in > that case for consistency... No. We simply cannot do that here except in the init case when the inode is not yet visible to userspace. In which case, we know -for certain- that the S_DAX is not set, and hence we do not need to clear it. Initial conditions matter! If you want to make sure of this, add this: ASSERT(!(IS_DAX(inode) && init)); And now we'll catch inodes that incorrectly have S_DAX set at init time. > > memory S_DAX flag, we can actually clear the on-disk flag > > safely, so that next time the inode cycles into memory it won't > > be using DAX. IOWs, admins can stop the applications, clear the > > DAX flag and drop caches. This should result in the inode being > > recycled and when the app is restarted it will run without DAX. > > No ned for deleting files, copying large data sets, etc just to > > turn off an inode flag. > > We already discussed evicting the inode and it was determined to > be too confusing.[*] That discussion did not even consider how admins are supposed to clear the inode flag once it is set on disk. It was entirely focussed around "we can't change in memory S_DAX state" and how the tri-state mount option to "override" the on-disk flag could be done. Nobody noticed that being unable to rmeove the on-disk flag means the admin's only option to turn off dax for an application is to turn it off for everything, filesystem wide, which requires: 1. stopping the app. 2. stopping every other app using the filesystem 3. unmounting the filesystem 4. changing to dax=never mount option 5. mounting the filesystem 6. restarting all apps. It's a hard stop for everything using the filesystem, and it changes the runtime environment for all applications, not just the one that needs DAX turned off. Not to mention that if it's the root filesystem that is using DAX, then it's a full system reboot needed to change the mount options. IMO, this is a non-starter from a production point of view - testing and qualification of all applications rather than just the affected app is required to make this sort of change. It simply does not follow the "minimal change to fix the problem" rules for managing issues in production environments. So, pLease explain to me how this process: 1. stop the app 2. remove inode flags via xfs_io 3. run drop_caches 4. start the app is worse than requiring admins to unmount the filesystem to turn off DAX for an application. > Furthermore, if we did want an interface like that why not allow > the on-disk flag to be set as well as cleared? Well, why not - it's why I implemented the flag in the first place! The only problem we have here is how to safely change the in-memory DAX state, and that largely has nothing to do with setting/clearing the on-disk flag.... Cheers, Dave.
On Wed, Apr 08, 2020 at 02:28:30PM -0700, Dan Williams wrote: > On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote: > > THis leads to an obvious conclusion: if we never clear the in memory > > S_DAX flag, we can actually clear the on-disk flag safely, so that > > next time the inode cycles into memory it won't be using DAX. IOWs, > > admins can stop the applications, clear the DAX flag and drop > > caches. This should result in the inode being recycled and when the > > app is restarted it will run without DAX. No ned for deleting files, > > copying large data sets, etc just to turn off an inode flag. > > Makes sense, but is that sufficient? I recall you saying there might > be a multitude of other reasons that the inode is not evicted, not the > least of which is races [1]. Does this need another flag, lets call it > "dax toggle" to track the "I requested the inode to clear the flag, > but on cache-flush + restart the inode never got evicted" case. You mean something like XFS_IDONTCACHE? i.e. the functionality already exists in XFS to selectively evict an inode from cache when the last reference to it is dropped rather than let it go to the LRUs and hang around in memory. That flag can be set when changing the on disk DAX flag, and we can tweak how it works so new cache hits don't clear it (as happens now). Hence the only thing that can prevent eviction are active references. That means we'll still need to stop the application and drop_caches, because we need to close all the files and purge the dentries that hold references to the inode before it can be evicted. Cheers, Dave.
On Thu, Apr 09, 2020 at 09:21:06AM +1000, Dave Chinner wrote: > On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote: > > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote: > > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: > > > > [snip] > > > > > > > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > > > > > > > void > > > > xfs_diflags_to_iflags( > > > > struct xfs_inode *ip, > > > > bool init) > > > > { > > > > struct inode *inode = VFS_I(ip); > > > > unsigned int xflags = xfs_ip2xflags(ip); > > > > unsigned int flags = 0; > > > > > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > > > S_DAX); > > > > > > We don't want to clear the dax flag here, ever, if it is already > > > set. That is an externally visible change and opens us up (again) to > > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > > > clearing the DAX flag was something I did explicitly in the above > > > code fragment. > > > > <sigh> yes... you are correct. > > > > But I don't like depending on the caller to clear the S_DAX flag if > > xfs_inode_enable_dax() is false. IMO this function should clear the flag in > > that case for consistency... > > No. We simply cannot do that here except in the init case when the > inode is not yet visible to userspace. In which case, we know -for > certain- that the S_DAX is not set, and hence we do not need to > clear it. Initial conditions matter! > > If you want to make sure of this, add this: > > ASSERT(!(IS_DAX(inode) && init)); > > And now we'll catch inodes that incorrectly have S_DAX set at init > time. Ok, that will work. Also documents that expected initial condition. > > > > memory S_DAX flag, we can actually clear the on-disk flag > > > safely, so that next time the inode cycles into memory it won't > > > be using DAX. IOWs, admins can stop the applications, clear the > > > DAX flag and drop caches. This should result in the inode being > > > recycled and when the app is restarted it will run without DAX. > > > No ned for deleting files, copying large data sets, etc just to > > > turn off an inode flag. > > > > We already discussed evicting the inode and it was determined to > > be too confusing.[*] > > That discussion did not even consider how admins are supposed to > clear the inode flag once it is set on disk. I think this is a bit unfair. I think we were all considering it... and I still think this patch set is a step in the right direction. > It was entirely > focussed around "we can't change in memory S_DAX state" Not true. There were many ideas on how to change the FS_XFLAG_DAX with some sort of delayed S_DAX state to avoid changing S_DAX on an in memory inode. I made the comment: "... I want to clarify. ... we could have the flag change with an appropriate error which could let the user know the change has been delayed." -- https://lore.kernel.org/lkml/20200402205518.GF3952565@iweiny-DESK2.sc.intel.com/ Jan made multiple comments: "I generally like the proposal but I think the fact that toggling FS_XFLAG_DAX will not have immediate effect on S_DAX will cause quite some confusion and ultimately bug reports." -- https://lore.kernel.org/lkml/20200401102511.GC19466@quack2.suse.cz/ "Just switch FS_XFLAG_DAX flag, S_DAX flag will magically switch when inode gets evicted and the inode gets reloaded from the disk again. Did I misunderstand anything? And my thinking was that this is surprising behavior for the user and so it will likely generate lots of bug reports along the lines of "DAX inode flag does not work!"." -- https://lore.kernel.org/lkml/20200403170338.GD29920@quack2.suse.cz/ Darrick also had similar ideas/comments. Christoph did say: "A reasonably smart application can try to evict itself." -- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/ Which I was unclear about??? Christoph does this mean you would be ok with changing the FS_XFLAG_DAX on disk and letting S_DAX change later? > and how the > tri-state mount option to "override" the on-disk flag could be done. > > Nobody noticed that being unable to rmeove the on-disk flag means > the admin's only option to turn off dax for an application is to > turn it off for everything, filesystem wide, which requires: No. This is not entirely true. While I don't like the idea of having to copy data (and I agree with your points) it is possible to do. > > 1. stopping the app. > 2. stopping every other app using the filesystem > 3. unmounting the filesystem > 4. changing to dax=never mount option I don't understand why we need to unmount and mount with dax=never? > 5. mounting the filesystem > 6. restarting all apps. > > It's a hard stop for everything using the filesystem, and it changes > the runtime environment for all applications, not just the one that > needs DAX turned off. Not to mention that if it's the root > filesystem that is using DAX, then it's a full system reboot needed > to change the mount options. > > IMO, this is a non-starter from a production point of view - testing > and qualification of all applications rather than just the affected > app is required to make this sort of change. It simply does not > follow the "minimal change to fix the problem" rules for managing > issues in production environments. > > So, pLease explain to me how this process: > > 1. stop the app > 2. remove inode flags via xfs_io > 3. run drop_caches > 4. start the app > > is worse than requiring admins to unmount the filesystem to turn off > DAX for an application. Jan? Christoph? > > > Furthermore, if we did want an interface like that why not allow > > the on-disk flag to be set as well as cleared? > > Well, why not - it's why I implemented the flag in the first place! > The only problem we have here is how to safely change the in-memory > DAX state, and that largely has nothing to do with setting/clearing > the on-disk flag.... With the above change to xfs_diflags_to_iflags() I think we are ok here. Ira
On Thu, Apr 09, 2020 at 09:58:36AM +1000, Dave Chinner wrote: > On Wed, Apr 08, 2020 at 02:28:30PM -0700, Dan Williams wrote: > > On Wed, Apr 8, 2020 at 2:02 PM Dave Chinner <david@fromorbit.com> wrote: > > > THis leads to an obvious conclusion: if we never clear the in memory > > > S_DAX flag, we can actually clear the on-disk flag safely, so that > > > next time the inode cycles into memory it won't be using DAX. IOWs, > > > admins can stop the applications, clear the DAX flag and drop > > > caches. This should result in the inode being recycled and when the > > > app is restarted it will run without DAX. No ned for deleting files, > > > copying large data sets, etc just to turn off an inode flag. > > > > Makes sense, but is that sufficient? I recall you saying there might > > be a multitude of other reasons that the inode is not evicted, not the > > least of which is races [1]. Does this need another flag, lets call it > > "dax toggle" to track the "I requested the inode to clear the flag, > > but on cache-flush + restart the inode never got evicted" case. > > You mean something like XFS_IDONTCACHE? > > i.e. the functionality already exists in XFS to selectively evict an > inode from cache when the last reference to it is dropped rather > than let it go to the LRUs and hang around in memory. > > That flag can be set when changing the on disk DAX flag, and we can > tweak how it works so new cache hits don't clear it (as happens > now). Hence the only thing that can prevent eviction are active > references. > > That means we'll still need to stop the application and drop_caches, > because we need to close all the files and purge the dentries that > hold references to the inode before it can be evicted. That sounds like a great idea... Jan? Christoph? I will reiterate though that any delayed S_DAX change be done as a follow on series to the one proposed here. Ira
On Wed, Apr 08, 2020 at 05:12:06PM -0700, Ira Weiny wrote: > On Thu, Apr 09, 2020 at 09:21:06AM +1000, Dave Chinner wrote: > > On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote: > > > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote: > > > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: > > > > > > [snip] > > > > > > > > > > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > > > > > > > > > void > > > > > xfs_diflags_to_iflags( > > > > > struct xfs_inode *ip, > > > > > bool init) > > > > > { > > > > > struct inode *inode = VFS_I(ip); > > > > > unsigned int xflags = xfs_ip2xflags(ip); > > > > > unsigned int flags = 0; > > > > > > > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > > > > S_DAX); > > > > > > > > We don't want to clear the dax flag here, ever, if it is already > > > > set. That is an externally visible change and opens us up (again) to > > > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > > > > clearing the DAX flag was something I did explicitly in the above > > > > code fragment. > > > > > > <sigh> yes... you are correct. > > > > > > But I don't like depending on the caller to clear the S_DAX flag if > > > xfs_inode_enable_dax() is false. IMO this function should clear the flag in > > > that case for consistency... > > > > No. We simply cannot do that here except in the init case when the > > inode is not yet visible to userspace. In which case, we know -for > > certain- that the S_DAX is not set, and hence we do not need to > > clear it. Initial conditions matter! > > > > If you want to make sure of this, add this: > > > > ASSERT(!(IS_DAX(inode) && init)); > > > > And now we'll catch inodes that incorrectly have S_DAX set at init > > time. > > Ok, that will work. Also documents that expected initial condition. > > > > > > > memory S_DAX flag, we can actually clear the on-disk flag > > > > safely, so that next time the inode cycles into memory it won't > > > > be using DAX. IOWs, admins can stop the applications, clear the > > > > DAX flag and drop caches. This should result in the inode being > > > > recycled and when the app is restarted it will run without DAX. > > > > No ned for deleting files, copying large data sets, etc just to > > > > turn off an inode flag. > > > > > > We already discussed evicting the inode and it was determined to > > > be too confusing.[*] > > > > That discussion did not even consider how admins are supposed to > > clear the inode flag once it is set on disk. > > I think this is a bit unfair. I think we were all considering it... and I > still think this patch set is a step in the right direction. > > > It was entirely > > focussed around "we can't change in memory S_DAX state" > > Not true. There were many ideas on how to change the FS_XFLAG_DAX with some > sort of delayed S_DAX state to avoid changing S_DAX on an in memory inode. > > I made the comment: > > "... I want to clarify. ... we could have the flag change with an > appropriate error which could let the user know the change has been > delayed." > > -- https://lore.kernel.org/lkml/20200402205518.GF3952565@iweiny-DESK2.sc.intel.com/ > > Jan made multiple comments: > > "I generally like the proposal but I think the fact that toggling > FS_XFLAG_DAX will not have immediate effect on S_DAX will cause quite > some confusion and ultimately bug reports." > > -- https://lore.kernel.org/lkml/20200401102511.GC19466@quack2.suse.cz/ > > > "Just switch FS_XFLAG_DAX flag, S_DAX flag will magically switch when > inode gets evicted and the inode gets reloaded from the disk again. Did > I misunderstand anything? > > And my thinking was that this is surprising behavior for the user and > so it will likely generate lots of bug reports along the lines of "DAX > inode flag does not work!"." > > -- https://lore.kernel.org/lkml/20200403170338.GD29920@quack2.suse.cz/ > > Darrick also had similar ideas/comments. > > Christoph did say: > > "A reasonably smart application can try to evict itself." > > -- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/ > > Which I was unclear about??? > > Christoph does this mean you would be ok with changing the FS_XFLAG_DAX on disk > and letting S_DAX change later? > > > and how the > > tri-state mount option to "override" the on-disk flag could be done. > > > > Nobody noticed that being unable to rmeove the on-disk flag means > > the admin's only option to turn off dax for an application is to > > turn it off for everything, filesystem wide, which requires: > > No. This is not entirely true. While I don't like the idea of having to copy > data (and I agree with your points) it is possible to do. But now that I think about it, that's really going to be a PITA, and probably more of a pain than if the two DAX flags are only loosely coupled. > > > > 1. stopping the app. > > 2. stopping every other app using the filesystem > > 3. unmounting the filesystem > > 4. changing to dax=never mount option > > I don't understand why we need to unmount and mount with dax=never? I've realized that if you can /never/ clear FS_XFLAG_DAX from a file, then the only way to force it off is dax=never (so the kernel ignores it) or move the fs to a non-pmem storage (so the kernel doesn't even try). > > 5. mounting the filesystem > > 6. restarting all apps. > > > > It's a hard stop for everything using the filesystem, and it changes > > the runtime environment for all applications, not just the one that > > needs DAX turned off. Not to mention that if it's the root > > filesystem that is using DAX, then it's a full system reboot needed > > to change the mount options. > > > > IMO, this is a non-starter from a production point of view - testing > > and qualification of all applications rather than just the affected > > app is required to make this sort of change. It simply does not > > follow the "minimal change to fix the problem" rules for managing > > issues in production environments. > > > > So, pLease explain to me how this process: > > > > 1. stop the app > > 2. remove inode flags via xfs_io > > 3. run drop_caches > > 4. start the app > > > > is worse than requiring admins to unmount the filesystem to turn off > > DAX for an application. > > Jan? Christoph? But you're right, this thing keeps swirling around and around and around because we can't ever get to agreement on this. Maybe I'll just become XFS BOFH MAINTAINER and make a decision like this: 1 Applications must call statx to discover the current S_DAX state. 2 There exists an advisory file inode flag FS_XFLAG_DAX that is set based on the parent directory FS_XFLAG_DAX inode flag. This advisory flag can be changed after file creation, but it does not immediately affect the S_DAX state. If FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX. Unless overridden... 3 There exists a dax= mount option. "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX" "-o dax=always" means "always set S_DAX (at least on pmem), ignore FS_XFLAG_DAX" "-o dax" by itself means "dax=always" "-o dax=iflag" means "follow FS_XFLAG_DAX" and is the default 4 There exists an advisory directory inode flag FS_XFLAG_DAX that can be changed at any time. The flag state is copied into any files or subdirectories when they are created within that directory. If programs require file access runs in S_DAX mode, they must create those files inside a directory with FS_XFLAG_DAX set, or mount the fs with an appropriate dax mount option. 5 Programs that require a specific file access mode (DAX or not DAX) must do one of the following: (a) create files in directories with the FS_XFLAG_DAX flag set as needed; (b) have the administrator set an override via mount option; (c) if they need to change a file's FS_XFLAG_DAX flag so that it does not match the S_DAX state (as reported by statx), they must cause the kernel to evict the inode from memory. This can be done by: i> closing the file; ii> re-opening the file and using statx to see if the fs has changed the S_DAX flag; iii> if not, either unmount and remount the filesystem, or closing the file and using drop_caches. 6 I no longer think it's too wild to require that users who want to squeeze every last bit of performance out of the particular rough and tumble bits of their storage also be exposed to the difficulties of what happens when the operating system can't totally virtualize those hardware capabilities. Your high performance sports car is not a Toyota minivan, as it were. I think (like Dave said) that if you set XFS_IDONTCACHE on the inode when you change the DAX flag, the VFS will kill the inode the instant the last user close()s the file. Then 5.c.ii will actually work. --D > > > > > Furthermore, if we did want an interface like that why not allow > > > the on-disk flag to be set as well as cleared? > > > > Well, why not - it's why I implemented the flag in the first place! > > The only problem we have here is how to safely change the in-memory > > DAX state, and that largely has nothing to do with setting/clearing > > the on-disk flag.... > > With the above change to xfs_diflags_to_iflags() I think we are ok here. > > Ira >
On Wed, Apr 08, 2020 at 05:12:06PM -0700, Ira Weiny wrote: > On Thu, Apr 09, 2020 at 09:21:06AM +1000, Dave Chinner wrote: > > On Wed, Apr 08, 2020 at 03:07:35PM -0700, Ira Weiny wrote: > > > On Thu, Apr 09, 2020 at 07:02:36AM +1000, Dave Chinner wrote: > > > > On Wed, Apr 08, 2020 at 10:09:23AM -0700, Ira Weiny wrote: > > > > > > [snip] > > > > > > > > > > > > > This sounds good but I think we need a slight modification to make the function equivalent in functionality. > > > > > > > > > > void > > > > > xfs_diflags_to_iflags( > > > > > struct xfs_inode *ip, > > > > > bool init) > > > > > { > > > > > struct inode *inode = VFS_I(ip); > > > > > unsigned int xflags = xfs_ip2xflags(ip); > > > > > unsigned int flags = 0; > > > > > > > > > > inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME | > > > > > S_DAX); > > > > > > > > We don't want to clear the dax flag here, ever, if it is already > > > > set. That is an externally visible change and opens us up (again) to > > > > races where IS_DAX() changes half way through a fault path. IOWs, avoiding > > > > clearing the DAX flag was something I did explicitly in the above > > > > code fragment. > > > > > > <sigh> yes... you are correct. > > > > > > But I don't like depending on the caller to clear the S_DAX flag if > > > xfs_inode_enable_dax() is false. IMO this function should clear the flag in > > > that case for consistency... > > > > No. We simply cannot do that here except in the init case when the > > inode is not yet visible to userspace. In which case, we know -for > > certain- that the S_DAX is not set, and hence we do not need to > > clear it. Initial conditions matter! > > > > If you want to make sure of this, add this: > > > > ASSERT(!(IS_DAX(inode) && init)); > > > > And now we'll catch inodes that incorrectly have S_DAX set at init > > time. > > Ok, that will work. Also documents that expected initial condition. > > > > > > > memory S_DAX flag, we can actually clear the on-disk flag > > > > safely, so that next time the inode cycles into memory it won't > > > > be using DAX. IOWs, admins can stop the applications, clear the > > > > DAX flag and drop caches. This should result in the inode being > > > > recycled and when the app is restarted it will run without DAX. > > > > No ned for deleting files, copying large data sets, etc just to > > > > turn off an inode flag. > > > > > > We already discussed evicting the inode and it was determined to > > > be too confusing.[*] > > > > That discussion did not even consider how admins are supposed to > > clear the inode flag once it is set on disk. > > I think this is a bit unfair. I think we were all considering it... and I > still think this patch set is a step in the right direction. > > > It was entirely > > focussed around "we can't change in memory S_DAX state" > > Not true. There were many ideas on how to change the FS_XFLAG_DAX with some > sort of delayed S_DAX state to avoid changing S_DAX on an in memory inode. > > I made the comment: > > "... I want to clarify. ... we could have the flag change with an > appropriate error which could let the user know the change has been > delayed." > > -- https://lore.kernel.org/lkml/20200402205518.GF3952565@iweiny-DESK2.sc.intel.com/ > > Jan made multiple comments: > > "I generally like the proposal but I think the fact that toggling > FS_XFLAG_DAX will not have immediate effect on S_DAX will cause quite > some confusion and ultimately bug reports." > > -- https://lore.kernel.org/lkml/20200401102511.GC19466@quack2.suse.cz/ > > > "Just switch FS_XFLAG_DAX flag, S_DAX flag will magically switch when > inode gets evicted and the inode gets reloaded from the disk again. Did > I misunderstand anything? > > And my thinking was that this is surprising behavior for the user and > so it will likely generate lots of bug reports along the lines of "DAX > inode flag does not work!"." > > -- https://lore.kernel.org/lkml/20200403170338.GD29920@quack2.suse.cz/ > > Darrick also had similar ideas/comments. None of these consider how the admin is supposed to remove the flag once it is set. They all talk about issues that result from setting/clearing the flag inside the kernel, and don't consider the administration impacts of having an unclearable flag on disk that the kernel uses for operational policy decisions. Nobody said "hey, wait, if we don't allow the flag to be cleared once the flag is set, how do we actually remove it so the admin can stop overriding them globally with the mount option? If the kernel can't clear the flag, then what mechanism are we going to provide to let them clear it without interrupting production?" > Christoph did say: > > "A reasonably smart application can try to evict itself." > > -- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/ I'd love to know how an unprivileged application can force the eviction of an inode from cache. If the application cannot evict files it is using reliably, then it's no better solution than drop_caches. And given that userspace has to take references to files to access them, by definition a file that userspace is trying to evict will have active references and hence cannot be evicted. However, if userspace can reliably evict inodes that it can access, then we have a timing attack vector that we need to address. i.e. by now everyone here should know that user controlled cache eviction is the basic requirement for creating most OS and CPU level timing attacks.... > > and how the > > tri-state mount option to "override" the on-disk flag could be done. > > > > Nobody noticed that being unable to rmeove the on-disk flag means > > the admin's only option to turn off dax for an application is to > > turn it off for everything, filesystem wide, which requires: > > No. This is not entirely true. While I don't like the idea of having to copy > data (and I agree with your points) it is possible to do. > > > > > 1. stopping the app. > > 2. stopping every other app using the filesystem > > 3. unmounting the filesystem > > 4. changing to dax=never mount option > > I don't understand why we need to unmount and mount with dax=never? 1. IIUC your patches correctly, that's exactly how you implemented the dax=... mount option. 2. If you don't unmount the filesystem and only require a remount, then changing the mount option has all the same "delayed effect" issues that changing the on-disk flag has. i.e. We can't change the in-memory inode state until the inode has been evicted from cache and a remount doesn't guarantee that cache eviction will succeed. Cheers, Dave.
On Thu, Apr 09, 2020 at 10:49:21AM +1000, Dave Chinner wrote: > > Christoph did say: > > > > "A reasonably smart application can try to evict itself." > > > > -- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/ > > I'd love to know how an unprivileged application can force the > eviction of an inode from cache. Where did the "unprivileged" suddenly come from?
On Wed, Apr 08, 2020 at 05:22:03PM -0700, Ira Weiny wrote: > > You mean something like XFS_IDONTCACHE? > > > > i.e. the functionality already exists in XFS to selectively evict an > > inode from cache when the last reference to it is dropped rather > > than let it go to the LRUs and hang around in memory. > > > > That flag can be set when changing the on disk DAX flag, and we can > > tweak how it works so new cache hits don't clear it (as happens > > now). Hence the only thing that can prevent eviction are active > > references. > > > > That means we'll still need to stop the application and drop_caches, > > because we need to close all the files and purge the dentries that > > hold references to the inode before it can be evicted. > > That sounds like a great idea... > > Jan? Christoph? Sounds ok. Note that we could also pretty trivially lift XFS_IDONTCACHE to the VFS if we need to apply the same scheme to ext4.
On Wed, Apr 08, 2020 at 05:30:21PM -0700, Darrick J. Wong wrote: [snip] > > But you're right, this thing keeps swirling around and around and around > because we can't ever get to agreement on this. Maybe I'll just become > XFS BOFH MAINTAINER and make a decision like this: > > 1 Applications must call statx to discover the current S_DAX state. > > 2 There exists an advisory file inode flag FS_XFLAG_DAX that is set based on > the parent directory FS_XFLAG_DAX inode flag. This advisory flag can be > changed after file creation, but it does not immediately affect the S_DAX > state. > > If FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at > inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX. > Unless overridden... > > 3 There exists a dax= mount option. > > "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX" > "-o dax=always" means "always set S_DAX (at least on pmem), ignore FS_XFLAG_DAX" > "-o dax" by itself means "dax=always" > "-o dax=iflag" means "follow FS_XFLAG_DAX" and is the default per-Dave '-o dax=inode' > > 4 There exists an advisory directory inode flag FS_XFLAG_DAX that can be > changed at any time. The flag state is copied into any files or > subdirectories when they are created within that directory. Good. > If programs > require file access runs in S_DAX mode, they must create those files > inside a directory with FS_XFLAG_DAX set, or mount the fs with an > appropriate dax mount option. Why do we need this to be true? If the FS_XFLAG_DAX flag can be cleared why not set it and allow the S_DAX change to occur later just like clearing it? The logic is exactly the same. > > 5 Programs that require a specific file access mode (DAX or not DAX) must s/must/can/ > do one of the following: > > (a) create files in directories with the FS_XFLAG_DAX flag set as needed; Again if we allow clearing the flag why not setting? So this is 1 option they 'can' do. > > (b) have the administrator set an override via mount option; > > (c) if they need to change a file's FS_XFLAG_DAX flag so that it does not > match the S_DAX state (as reported by statx), they must cause the > kernel to evict the inode from memory. This can be done by: > > i> closing the file; > ii> re-opening the file and using statx to see if the fs has > changed the S_DAX flag; i and ii need to be 1 step the user must follow. > iii> if not, either unmount and remount the filesystem, or > closing the file and using drop_caches. > > 6 I no longer think it's too wild to require that users who want to > squeeze every last bit of performance out of the particular rough and > tumble bits of their storage also be exposed to the difficulties of > what happens when the operating system can't totally virtualize those > hardware capabilities. Your high performance sports car is not a > Toyota minivan, as it were. I'm good with this statement. But I think we need to clean up the verbiage for the documentation... ;-) Thanks for the summary. I like these to get everyone on the same page. :-D Ira > > I think (like Dave said) that if you set XFS_IDONTCACHE on the inode > when you change the DAX flag, the VFS will kill the inode the instant > the last user close()s the file. Then 5.c.ii will actually work. > > --D > > > > > > > > Furthermore, if we did want an interface like that why not allow > > > > the on-disk flag to be set as well as cleared? > > > > > > Well, why not - it's why I implemented the flag in the first place! > > > The only problem we have here is how to safely change the in-memory > > > DAX state, and that largely has nothing to do with setting/clearing > > > the on-disk flag.... > > > > With the above change to xfs_diflags_to_iflags() I think we are ok here. > > > > Ira > >
On Thu, Apr 09, 2020 at 08:29:44AM -0700, Ira Weiny wrote: > On Wed, Apr 08, 2020 at 05:30:21PM -0700, Darrick J. Wong wrote: > > [snip] > > > > > But you're right, this thing keeps swirling around and around and around > > because we can't ever get to agreement on this. Maybe I'll just become > > XFS BOFH MAINTAINER and make a decision like this: > > > > 1 Applications must call statx to discover the current S_DAX state. > > > > 2 There exists an advisory file inode flag FS_XFLAG_DAX that is set based on > > the parent directory FS_XFLAG_DAX inode flag. This advisory flag can be > > changed after file creation, but it does not immediately affect the S_DAX > > state. > > > > If FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at > > inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX. > > Unless overridden... > > > > 3 There exists a dax= mount option. > > > > "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX" > > "-o dax=always" means "always set S_DAX (at least on pmem), ignore FS_XFLAG_DAX" > > "-o dax" by itself means "dax=always" > > "-o dax=iflag" means "follow FS_XFLAG_DAX" and is the default > > per-Dave '-o dax=inode' Ok. > > > > 4 There exists an advisory directory inode flag FS_XFLAG_DAX that can be > > changed at any time. The flag state is copied into any files or > > subdirectories when they are created within that directory. > > Good. > > > If programs > > require file access runs in S_DAX mode, they must create those files > > inside a directory with FS_XFLAG_DAX set, or mount the fs with an > > appropriate dax mount option. > > Why do we need this to be true? If the FS_XFLAG_DAX flag can be cleared why > not set it and allow the S_DAX change to occur later just like clearing it? > The logic is exactly the same. I think I'll just delete this sentence since I started pushing all that verbiage towards (5). To answer your question, yes, FS_XFLAG_DAX can be set on a file at any time, the same as it can be cleared at any time. Sorry that was unclear, I'll fix that for the next draft (below). > > > > 5 Programs that require a specific file access mode (DAX or not DAX) must > > s/must/can/ Ok. > > do one of the following: > > > > (a) create files in directories with the FS_XFLAG_DAX flag set as needed; > > Again if we allow clearing the flag why not setting? So this is 1 option they > 'can' do. > > > > > (b) have the administrator set an override via mount option; > > > > (c) if they need to change a file's FS_XFLAG_DAX flag so that it does not > > match the S_DAX state (as reported by statx), they must cause the > > kernel to evict the inode from memory. This can be done by: > > > > i> closing the file; > > ii> re-opening the file and using statx to see if the fs has > > changed the S_DAX flag; > > i and ii need to be 1 step the user must follow. Ok, I'll combine these. > > iii> if not, either unmount and remount the filesystem, or > > closing the file and using drop_caches. > > > > 6 I no longer think it's too wild to require that users who want to > > squeeze every last bit of performance out of the particular rough and > > tumble bits of their storage also be exposed to the difficulties of > > what happens when the operating system can't totally virtualize those > > hardware capabilities. Your high performance sports car is not a > > Toyota minivan, as it were. > > I'm good with this statement. But I think we need to clean up the verbiage for > the documentation... ;-) Heh. :) > Thanks for the summary. I like these to get everyone on the same page. :-D And today: 1. There exists an in-kernel access mode flag S_DAX that is set when file accesses go directly to persistent memory, bypassing the page cache. Applications must call statx to discover the current S_DAX state (STATX_ATTR_DAX). 2. There exists an advisory file inode flag FS_XFLAG_DAX that is inherited from the parent directory FS_XFLAG_DAX inode flag at file creation time. This advisory flag can be set or cleared at any time, but doing so does not immediately affect the S_DAX state. Unless overridden by mount options (see (3)), if FS_XFLAG_DAX is set and the fs is on pmem then it will enable S_DAX at inode load time; if FS_XFLAG_DAX is not set, it will not enable S_DAX. 3. There exists a dax= mount option. "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX." "-o dax=always" means "always set S_DAX (at least on pmem), and ignore FS_XFLAG_DAX." "-o dax" is an alias for "dax=always". "-o dax=inode" means "follow FS_XFLAG_DAX" and is the default. 4. There exists an advisory directory inode flag FS_XFLAG_DAX that can be set or cleared at any time. The flag state is copied into any files or subdirectories when they are created within that directory. 5. Programs that require a specific file access mode (DAX or not DAX) can do one of the following: (a) Create files in directories that the FS_XFLAG_DAX flag set as needed; or (b) Have the administrator set an override via mount option; or (c) Set or clear the file's FS_XFLAG_DAX flag as needed. Programs must then cause the kernel to evict the inode from memory. This can be done by: i> Closing the file and re-opening the file and using statx to see if the fs has changed the S_DAX flag; and ii> If the file still does not have the desired S_DAX access mode, either unmount and remount the filesystem, or close the file and use drop_caches. 6. It's not unreasonable that users who want to squeeze every last bit of performance out of the particular rough and tumble bits of their storage also be exposed to the difficulties of what happens when the operating system can't totally virtualize those hardware capabilities. Your high performance sports car is not a Toyota minivan, as it were. Given our overnight discussions, I don't think it'll be difficult to hoist XFS_IDONTCACHE to the VFS so that 5.c.i is enough to change the S_DAX state if nobody else is using the file. --D > > > > > Furthermore, if we did want an interface like that why not allow > > > the on-disk flag to be set as well as cleared? > > > > Well, why not - it's why I implemented the flag in the first place! > > The only problem we have here is how to safely change the in-memory > > DAX state, and that largely has nothing to do with setting/clearing > > the on-disk flag.... > > With the above change to xfs_diflags_to_iflags() I think we are ok here. > > Ira > --D > Ira > > > > > I think (like Dave said) that if you set XFS_IDONTCACHE on the inode > > when you change the DAX flag, the VFS will kill the inode the instant > > the last user close()s the file. Then 5.c.ii will actually work. > > > > --D > > > > > > > > > > > Furthermore, if we did want an interface like that why not allow > > > > > the on-disk flag to be set as well as cleared? > > > > > > > > Well, why not - it's why I implemented the flag in the first place! > > > > The only problem we have here is how to safely change the in-memory > > > > DAX state, and that largely has nothing to do with setting/clearing > > > > the on-disk flag.... > > > > > > With the above change to xfs_diflags_to_iflags() I think we are ok here. > > > > > > Ira > > >
On Thu 09-04-20 09:59:27, Darrick J. Wong wrote: > And today: > > 1. There exists an in-kernel access mode flag S_DAX that is set when > file accesses go directly to persistent memory, bypassing the page > cache. Applications must call statx to discover the current S_DAX > state (STATX_ATTR_DAX). > > 2. There exists an advisory file inode flag FS_XFLAG_DAX that is > inherited from the parent directory FS_XFLAG_DAX inode flag at file > creation time. This advisory flag can be set or cleared at any > time, but doing so does not immediately affect the S_DAX state. > > Unless overridden by mount options (see (3)), if FS_XFLAG_DAX is set > and the fs is on pmem then it will enable S_DAX at inode load time; > if FS_XFLAG_DAX is not set, it will not enable S_DAX. > > 3. There exists a dax= mount option. > > "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX." > > "-o dax=always" means "always set S_DAX (at least on pmem), > and ignore FS_XFLAG_DAX." > > "-o dax" is an alias for "dax=always". > > "-o dax=inode" means "follow FS_XFLAG_DAX" and is the default. > > 4. There exists an advisory directory inode flag FS_XFLAG_DAX that can > be set or cleared at any time. The flag state is copied into any > files or subdirectories when they are created within that directory. > > 5. Programs that require a specific file access mode (DAX or not DAX) > can do one of the following: > > (a) Create files in directories that the FS_XFLAG_DAX flag set as > needed; or > > (b) Have the administrator set an override via mount option; or > > (c) Set or clear the file's FS_XFLAG_DAX flag as needed. Programs > must then cause the kernel to evict the inode from memory. This > can be done by: > > i> Closing the file and re-opening the file and using statx to > see if the fs has changed the S_DAX flag; and > > ii> If the file still does not have the desired S_DAX access > mode, either unmount and remount the filesystem, or close > the file and use drop_caches. > > 6. It's not unreasonable that users who want to squeeze every last bit > of performance out of the particular rough and tumble bits of their > storage also be exposed to the difficulties of what happens when the > operating system can't totally virtualize those hardware > capabilities. Your high performance sports car is not a Toyota > minivan, as it were. > > Given our overnight discussions, I don't think it'll be difficult to > hoist XFS_IDONTCACHE to the VFS so that 5.c.i is enough to change the > S_DAX state if nobody else is using the file. I still find the "S_DAX changes on inode eviction" confusing but I guess it's as good as it gets and with XFS_IDONTCACHE lifted to VFS, it seems acceptable so OK from me. Honza
On Thu, Apr 09, 2020 at 02:41:27PM +0200, Christoph Hellwig wrote: > On Wed, Apr 08, 2020 at 05:22:03PM -0700, Ira Weiny wrote: > > > You mean something like XFS_IDONTCACHE? > > > > > > i.e. the functionality already exists in XFS to selectively evict an > > > inode from cache when the last reference to it is dropped rather > > > than let it go to the LRUs and hang around in memory. > > > > > > That flag can be set when changing the on disk DAX flag, and we can > > > tweak how it works so new cache hits don't clear it (as happens > > > now). Hence the only thing that can prevent eviction are active > > > references. > > > > > > That means we'll still need to stop the application and drop_caches, > > > because we need to close all the files and purge the dentries that > > > hold references to the inode before it can be evicted. > > > > That sounds like a great idea... > > > > Jan? Christoph? > > Sounds ok. Note that we could also pretty trivially lift > XFS_IDONTCACHE to the VFS if we need to apply the same scheme to > ext4. Yes I have been slowing working on ext4 in the background. So lifting XXX_IDONTCACHE to VFS will be part of that series. Thanks, Ira
On Thu, Apr 09, 2020 at 09:59:27AM -0700, Darrick J. Wong wrote: [snip] > And today: > > 1. There exists an in-kernel access mode flag S_DAX that is set when > file accesses go directly to persistent memory, bypassing the page > cache. Applications must call statx to discover the current S_DAX > state (STATX_ATTR_DAX). > > 2. There exists an advisory file inode flag FS_XFLAG_DAX that is > inherited from the parent directory FS_XFLAG_DAX inode flag at file > creation time. This advisory flag can be set or cleared at any > time, but doing so does not immediately affect the S_DAX state. > > Unless overridden by mount options (see (3)), if FS_XFLAG_DAX is set > and the fs is on pmem then it will enable S_DAX at inode load time; > if FS_XFLAG_DAX is not set, it will not enable S_DAX. > > 3. There exists a dax= mount option. > > "-o dax=never" means "never set S_DAX, ignore FS_XFLAG_DAX." > > "-o dax=always" means "always set S_DAX (at least on pmem), > and ignore FS_XFLAG_DAX." > > "-o dax" is an alias for "dax=always". > > "-o dax=inode" means "follow FS_XFLAG_DAX" and is the default. > > 4. There exists an advisory directory inode flag FS_XFLAG_DAX that can > be set or cleared at any time. The flag state is copied into any > files or subdirectories when they are created within that directory. > > 5. Programs that require a specific file access mode (DAX or not DAX) > can do one of the following: > > (a) Create files in directories that the FS_XFLAG_DAX flag set as > needed; or > > (b) Have the administrator set an override via mount option; or > > (c) Set or clear the file's FS_XFLAG_DAX flag as needed. Programs > must then cause the kernel to evict the inode from memory. This > can be done by: > > i> Closing the file and re-opening the file and using statx to > see if the fs has changed the S_DAX flag; and > > ii> If the file still does not have the desired S_DAX access > mode, either unmount and remount the filesystem, or close > the file and use drop_caches. > > 6. It's not unreasonable that users who want to squeeze every last bit > of performance out of the particular rough and tumble bits of their > storage also be exposed to the difficulties of what happens when the > operating system can't totally virtualize those hardware > capabilities. Your high performance sports car is not a Toyota > minivan, as it were. > > Given our overnight discussions, I don't think it'll be difficult to > hoist XFS_IDONTCACHE to the VFS so that 5.c.i is enough to change the > S_DAX state if nobody else is using the file. Agreed! One note on implementation, I plan to get XFS_IDONTCACHE tested with XFS and leave hoisting it to VFS for the series which enables ext4 as that is when we would need such hoisting. Thanks everyone for another round of discussions! :-D Ira
On Thu, Apr 09, 2020 at 02:40:31PM +0200, Christoph Hellwig wrote: > On Thu, Apr 09, 2020 at 10:49:21AM +1000, Dave Chinner wrote: > > > Christoph did say: > > > > > > "A reasonably smart application can try to evict itself." > > > > > > -- https://lore.kernel.org/lkml/20200403072731.GA24176@lst.de/ > > > > I'd love to know how an unprivileged application can force the > > eviction of an inode from cache. > > Where did the "unprivileged" suddenly come from? I'm assuming that applications are being run without the root permissions needed to run drop_caches. i.e. the apps are unprivileged, and therefore can't brute force inode cache eviction. That's why I'm asking what mechanism these applications are using to evict inodes on demand without requiring elevated privileges, because I can't see how they'd acheive this... Cheers, Dave.
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 492e53992fa9..e76ed9ca17f7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -466,6 +466,7 @@ int xfs_break_layouts(struct inode *inode, uint *iolock, /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); /* * When setting up a newly allocated inode, we need to call diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42de92cb283..c6cd92ef4a05 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1100,37 +1100,6 @@ xfs_flags2diflags2( return di_flags2; } -STATIC void -xfs_diflags_to_linux( - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - unsigned int xflags = xfs_ip2xflags(ip); - - if (xflags & FS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (xflags & FS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (xflags & FS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (xflags & FS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; -#if 0 /* disabled until the flag switching races are sorted out */ - if (xflags & FS_XFLAG_DAX) - inode->i_flags |= S_DAX; - else - inode->i_flags &= ~S_DAX; -#endif -} - static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, @@ -1168,7 +1137,7 @@ xfs_ioctl_setattr_xflags( ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; - xfs_diflags_to_linux(ip); + xfs_diflags_to_iflags(ip, false); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e07f7b641226..a4ac8568c8c7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1259,7 +1259,7 @@ xfs_inode_supports_dax( return xfs_inode_buftarg(ip)->bt_daxdev != NULL; } -STATIC bool +static bool xfs_inode_enable_dax( struct xfs_inode *ip) { @@ -1272,26 +1272,38 @@ xfs_inode_enable_dax( return false; } -STATIC void +void xfs_diflags_to_iflags( - struct inode *inode, - struct xfs_inode *ip) + struct xfs_inode *ip, + bool init) { - uint16_t flags = ip->i_d.di_flags; - - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | - S_NOATIME | S_DAX); + struct inode *inode = VFS_I(ip); + uint diflags = xfs_ip2xflags(ip); - if (flags & XFS_DIFLAG_IMMUTABLE) + if (diflags & FS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - if (flags & XFS_DIFLAG_APPEND) + else + inode->i_flags &= ~S_IMMUTABLE; + if (diflags & FS_XFLAG_APPEND) inode->i_flags |= S_APPEND; - if (flags & XFS_DIFLAG_SYNC) + else + inode->i_flags &= ~S_APPEND; + if (diflags & FS_XFLAG_SYNC) inode->i_flags |= S_SYNC; - if (flags & XFS_DIFLAG_NOATIME) + else + inode->i_flags &= ~S_SYNC; + if (diflags & FS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (xfs_inode_enable_dax(ip)) - inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_NOATIME; + + /* Only toggle the dax flag when initializing */ + if (init) { + if (xfs_inode_enable_dax(ip)) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } } /* @@ -1320,7 +1332,7 @@ xfs_setup_inode( inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); i_size_write(inode, ip->i_d.di_size); - xfs_diflags_to_iflags(inode, ip); + xfs_diflags_to_iflags(ip, true); if (S_ISDIR(inode->i_mode)) { /*