diff mbox series

[v2,05/18] xfs: Add xfs_break_layouts() to the inode eviction path

Message ID 166329933874.2786261.18236541386474985669.stgit@dwillia2-xfh.jf.intel.com (mailing list archive)
State New
Headers show
Series Fix the DAX-gup mistake | expand

Commit Message

Dan Williams Sept. 16, 2022, 3:35 a.m. UTC
In preparation for moving DAX pages to be 0-based rather than 1-based
for the idle refcount, the fsdax core wants to have all mappings in a
"zapped" state before truncate. For typical pages this happens naturally
via unmap_mapping_range(), for DAX pages some help is needed to record
this state in the 'struct address_space' of the inode(s) where the page
is mapped.

That "zapped" state is recorded in DAX entries as a side effect of
xfs_break_layouts(). Arrange for it to be called before all truncation
events which already happens for truncate() and PUNCH_HOLE, but not
truncate_inode_pages_final(). Arrange for xfs_break_layouts() before
truncate_inode_pages_final().

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c  |   13 +++++++++----
 fs/xfs/xfs_inode.c |    3 ++-
 fs/xfs/xfs_inode.h |    6 ++++--
 fs/xfs/xfs_super.c |   22 ++++++++++++++++++++++
 4 files changed, 37 insertions(+), 7 deletions(-)

Comments

Dan Williams Sept. 19, 2022, 4:11 p.m. UTC | #1
Dave Chinner wrote:
> On Thu, Sep 15, 2022 at 08:35:38PM -0700, Dan Williams wrote:
> > In preparation for moving DAX pages to be 0-based rather than 1-based
> > for the idle refcount, the fsdax core wants to have all mappings in a
> > "zapped" state before truncate. For typical pages this happens naturally
> > via unmap_mapping_range(), for DAX pages some help is needed to record
> > this state in the 'struct address_space' of the inode(s) where the page
> > is mapped.
> > 
> > That "zapped" state is recorded in DAX entries as a side effect of
> > xfs_break_layouts(). Arrange for it to be called before all truncation
> > events which already happens for truncate() and PUNCH_HOLE, but not
> > truncate_inode_pages_final(). Arrange for xfs_break_layouts() before
> > truncate_inode_pages_final().
> 
> Ugh. That's nasty and awful.
> 
> 
> 
> > Cc: Matthew Wilcox <willy@infradead.org>
> > Cc: Jan Kara <jack@suse.cz>
> > Cc: "Darrick J. Wong" <djwong@kernel.org>
> > Cc: Jason Gunthorpe <jgg@nvidia.com>
> > Cc: Christoph Hellwig <hch@lst.de>
> > Cc: John Hubbard <jhubbard@nvidia.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > ---
> >  fs/xfs/xfs_file.c  |   13 +++++++++----
> >  fs/xfs/xfs_inode.c |    3 ++-
> >  fs/xfs/xfs_inode.h |    6 ++++--
> >  fs/xfs/xfs_super.c |   22 ++++++++++++++++++++++
> >  4 files changed, 37 insertions(+), 7 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 556e28d06788..d3ff692d5546 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -816,7 +816,8 @@ xfs_wait_dax_page(
> >  int
> >  xfs_break_dax_layouts(
> >  	struct inode		*inode,
> > -	bool			*retry)
> > +	bool			*retry,
> > +	int			state)
> >  {
> >  	struct page		*page;
> >  
> > @@ -827,8 +828,8 @@ xfs_break_dax_layouts(
> >  		return 0;
> >  
> >  	*retry = true;
> > -	return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
> > -				 0, 0, xfs_wait_dax_page(inode));
> > +	return ___wait_var_event(page, dax_page_idle(page), state, 0, 0,
> > +				 xfs_wait_dax_page(inode));
> >  }
> >  
> >  int
> > @@ -839,14 +840,18 @@ xfs_break_layouts(
> >  {
> >  	bool			retry;
> >  	int			error;
> > +	int			state = TASK_INTERRUPTIBLE;
> >  
> >  	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
> >  
> >  	do {
> >  		retry = false;
> >  		switch (reason) {
> > +		case BREAK_UNMAP_FINAL:
> > +			state = TASK_UNINTERRUPTIBLE;
> > +			fallthrough;
> >  		case BREAK_UNMAP:
> > -			error = xfs_break_dax_layouts(inode, &retry);
> > +			error = xfs_break_dax_layouts(inode, &retry, state);
> >  			if (error || retry)
> >  				break;
> >  			fallthrough;
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 28493c8e9bb2..72ce1cb72736 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -3452,6 +3452,7 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
> >  	struct xfs_inode	*ip1,
> >  	struct xfs_inode	*ip2)
> >  {
> > +	int			state = TASK_INTERRUPTIBLE;
> >  	int			error;
> >  	bool			retry;
> >  	struct page		*page;
> > @@ -3463,7 +3464,7 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
> >  	retry = false;
> >  	/* Lock the first inode */
> >  	xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
> > -	error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
> > +	error = xfs_break_dax_layouts(VFS_I(ip1), &retry, state);
> >  	if (error || retry) {
> >  		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
> >  		if (error == 0 && retry)
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index fa780f08dc89..e4994eb6e521 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -454,11 +454,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
> >   * layout-holder has a consistent view of the file's extent map. While
> >   * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
> >   * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
> > - * go idle.
> > + * go idle. BREAK_UNMAP_FINAL is an uninterruptible version of
> > + * BREAK_UNMAP.
> >   */
> >  enum layout_break_reason {
> >          BREAK_WRITE,
> >          BREAK_UNMAP,
> > +        BREAK_UNMAP_FINAL,
> >  };
> >  
> >  /*
> > @@ -531,7 +533,7 @@ xfs_itruncate_extents(
> >  }
> >  
> >  /* from xfs_file.c */
> > -int	xfs_break_dax_layouts(struct inode *inode, bool *retry);
> > +int	xfs_break_dax_layouts(struct inode *inode, bool *retry, int state);
> >  int	xfs_break_layouts(struct inode *inode, uint *iolock,
> >  		enum layout_break_reason reason);
> >  
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index 9ac59814bbb6..ebb4a6eba3fc 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -725,6 +725,27 @@ xfs_fs_drop_inode(
> >  	return generic_drop_inode(inode);
> >  }
> >  
> > +STATIC void
> > +xfs_fs_evict_inode(
> > +	struct inode		*inode)
> > +{
> > +	struct xfs_inode	*ip = XFS_I(inode);
> > +	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > +	long			error;
> > +
> > +	xfs_ilock(ip, iolock);
> 
> I'm guessing you never ran this through lockdep.

I always run with lockdep enabled in my development kernels, but maybe my
testing was insufficient? Somewhat moot with your concerns below...

> The general rule is that XFS should not take inode locks directly in
> the inode eviction path because lockdep tends to throw all manner of
> memory reclaim related false positives when we do this. We most
> definitely don't want to be doing this for anything other than
> regular files that are DAX enabled, yes?

Guilty. I sought to satisfy the locking expectations of the
break_layouts internals rather than drop the unnecessary locking.

> 
> We also don't want to arbitrarily block memory reclaim for long
> periods of time waiting on an inode lock.  People seem to get very
> upset when we introduce unbound latencies into the memory reclaim
> path...
> 
> Indeed, what are you actually trying to serialise against here?
> Nothing should have a reference to the inode, nor should anything be
> able to find and take a new reference to the inode while it is being
> evicted....

Ok.

> 
> > +	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP_FINAL);
> > +
> > +	/* The final layout break is uninterruptible */
> > +	ASSERT_ALWAYS(!error);
> 
> We don't do error handling with BUG(). If xfs_break_layouts() truly
> can't fail (what happens if the fs is shut down and some internal
> call path now detects that and returns -EFSCORRUPTED?), theni
> WARN_ON_ONCE() and continuing to tear down the inode so the system
> is not immediately compromised is the appropriate action here.
> 
> > +
> > +	truncate_inode_pages_final(&inode->i_data);
> > +	clear_inode(inode);
> > +
> > +	xfs_iunlock(ip, iolock);
> > +}
> 
> That all said, this really looks like a bit of a band-aid.

It definitely is since DAX is in this transitory state between doing
some activities page-less and others with page metadata. If DAX was
fully committed to behaving like a typical page then
unmap_mapping_range() would have already satisfied this reference
counting situation.

> I can't work out why would we we ever have an actual layout lease
> here that needs breaking given they are file based and active files
> hold a reference to the inode. If we ever break that, then I suspect
> this change will cause major problems for anyone using pNFS with XFS
> as xfs_break_layouts() can end up waiting for NFS delegation
> revocation. This is something we should never be doing in inode
> eviction/memory reclaim.
> 
> Hence I have to ask why this lease break is being done
> unconditionally for all inodes, instead of only calling
> xfs_break_dax_layouts() directly on DAX enabled regular files?  I
> also wonder what exciting new system deadlocks this will create
> because BREAK_UNMAP_FINAL can essentially block forever waiting on
> dax mappings going away. If that DAX mapping reclaim requires memory
> allocations.....

There should be no memory allocations in the DAX mapping reclaim path.
Also, the page pins it waits for are precluded from being GUP_LONGTERM.

> 
> /me looks deeper into the dax_layout_busy_page() stuff and realises
> that both ext4 and XFS implementations of ext4_break_layouts() and
> xfs_break_dax_layouts() are actually identical.
> 
> That is, filemap_invalidate_unlock() and xfs_iunlock(ip,
> XFS_MMAPLOCK_EXCL) operate on exactly the same
> inode->i_mapping->invalidate_lock. Hence the implementations in ext4
> and XFS are both functionally identical.

I assume you mean for the purposes of this "final" break since
xfs_file_allocate() holds XFS_IOLOCK_EXCL over xfs_break_layouts().

> Further, when the inode is
> in the eviction path there is no reason for needing to take that
> mapping->invalidation_lock to invalidate remaining stale DAX
> mappings before truncate blasts them away.
> 
> IOWs, I don't see why fixing this problem needs to add new code to
> XFS or ext4 at all. The DAX mapping invalidation and waiting can be
> done enitrely within truncate_inode_pages_final() (conditional on
> IS_DAX()) after mapping_set_exiting() has been set with generic code
> and it should not require locking at all. I also think that
> ext4_break_layouts() and xfs_break_dax_layouts() should be merged
> into a generic dax infrastructure function so the filesystems don't
> need to care about the internal details of DAX mappings at all...

Yes, I think I can make that happen. Thanks Dave.
Dan Williams Sept. 20, 2022, 4:44 p.m. UTC | #2
Dave Chinner wrote:
> On Mon, Sep 19, 2022 at 09:11:48AM -0700, Dan Williams wrote:
> > Dave Chinner wrote:
> > > On Thu, Sep 15, 2022 at 08:35:38PM -0700, Dan Williams wrote:
> > > > In preparation for moving DAX pages to be 0-based rather than 1-based
> > > > for the idle refcount, the fsdax core wants to have all mappings in a
> > > > "zapped" state before truncate. For typical pages this happens naturally
> > > > via unmap_mapping_range(), for DAX pages some help is needed to record
> > > > this state in the 'struct address_space' of the inode(s) where the page
> > > > is mapped.
> > > > 
> > > > That "zapped" state is recorded in DAX entries as a side effect of
> > > > xfs_break_layouts(). Arrange for it to be called before all truncation
> > > > events which already happens for truncate() and PUNCH_HOLE, but not
> > > > truncate_inode_pages_final(). Arrange for xfs_break_layouts() before
> > > > truncate_inode_pages_final().
> ....
> > > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > > index 9ac59814bbb6..ebb4a6eba3fc 100644
> > > > --- a/fs/xfs/xfs_super.c
> > > > +++ b/fs/xfs/xfs_super.c
> > > > @@ -725,6 +725,27 @@ xfs_fs_drop_inode(
> > > >  	return generic_drop_inode(inode);
> > > >  }
> > > >  
> > > > +STATIC void
> > > > +xfs_fs_evict_inode(
> > > > +	struct inode		*inode)
> > > > +{
> > > > +	struct xfs_inode	*ip = XFS_I(inode);
> > > > +	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > > > +	long			error;
> > > > +
> > > > +	xfs_ilock(ip, iolock);
> > > 
> > > I'm guessing you never ran this through lockdep.
> > 
> > I always run with lockdep enabled in my development kernels, but maybe my
> > testing was insufficient? Somewhat moot with your concerns below...
> 
> I'm guessing your testing doesn't generate inode cache pressure and
> then have direct memory reclaim inodes. e.g. on a directory inode
> this will trigger lockdep immediately because readdir locks with
> XFS_IOLOCK_SHARED and then does GFP_KERNEL memory reclaim. If we try
> to take XFS_IOLOCK_EXCL from memory reclaim of directory inodes,
> lockdep will then shout from the rooftops...

Got it.

> 
> > > > +
> > > > +	truncate_inode_pages_final(&inode->i_data);
> > > > +	clear_inode(inode);
> > > > +
> > > > +	xfs_iunlock(ip, iolock);
> > > > +}
> > > 
> > > That all said, this really looks like a bit of a band-aid.
> > 
> > It definitely is since DAX is in this transitory state between doing
> > some activities page-less and others with page metadata. If DAX was
> > fully committed to behaving like a typical page then
> > unmap_mapping_range() would have already satisfied this reference
> > counting situation.
> > 
> > > I can't work out why would we we ever have an actual layout lease
> > > here that needs breaking given they are file based and active files
> > > hold a reference to the inode. If we ever break that, then I suspect
> > > this change will cause major problems for anyone using pNFS with XFS
> > > as xfs_break_layouts() can end up waiting for NFS delegation
> > > revocation. This is something we should never be doing in inode
> > > eviction/memory reclaim.
> > > 
> > > Hence I have to ask why this lease break is being done
> > > unconditionally for all inodes, instead of only calling
> > > xfs_break_dax_layouts() directly on DAX enabled regular files?  I
> > > also wonder what exciting new system deadlocks this will create
> > > because BREAK_UNMAP_FINAL can essentially block forever waiting on
> > > dax mappings going away. If that DAX mapping reclaim requires memory
> > > allocations.....
> > 
> > There should be no memory allocations in the DAX mapping reclaim path.
> > Also, the page pins it waits for are precluded from being GUP_LONGTERM.
> 
> So if the task that holds the pin needs memory allocation before it
> can unpin the page to allow direct inode reclaim to make progress?

No, it couldn't, and I realize now that GUP_LONGTERM has nothing to do
with this hang since any GFP_KERNEL in a path that took a DAX page pin
path could run afoul of this need to wait.

So, this has me looking at invalidate_inodes() and iput_final(), where I
did not see the reclaim entanglement, and thinking DAX has the unique
requirement to make sure that no access to a page outlives the hosting
inode.

Not that I need to tell you, but to get my own thinking straight,
compare that to typical page cache as the pinner can keep a pinned
page-cache page as long as it wants even after it has been truncated.
DAX needs to make sure that truncate_inode_pages() ceases all access to
the page synchronous with the truncate. The typical page-cache will
ensure that the next mapping of the file will get a new page if the page
previously pinned for that offset is still in use, DAX can not offer
that as the same page that was previously pinned is always used.

So I think this means something like this:

diff --git a/fs/inode.c b/fs/inode.c
index 6462276dfdf0..ab16772b9a8d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -784,6 +784,11 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
                        continue;
                }
 
+               if (dax_inode_busy(inode)) {
+                       busy = 1;
+                       continue;
+               }
+
                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
@@ -1733,6 +1738,8 @@ static void iput_final(struct inode *inode)
                spin_unlock(&inode->i_lock);
 
                write_inode_now(inode, 1);
+               if (IS_DAX(inode))
+                       dax_break_layouts(inode);
 
                spin_lock(&inode->i_lock);
                state = inode->i_state;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..e4a74ab310b5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3028,8 +3028,20 @@ extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
 extern int generic_delete_inode(struct inode *inode);
+
+static inline bool dax_inode_busy(struct inode *inode)
+{
+       if (!IS_DAX(inode))
+               return false;
+
+       return dax_zap_pages(inode) != NULL;
+}
+
 static inline int generic_drop_inode(struct inode *inode)
 {
+       if (dax_inode_busy(inode))
+               return 0;
+
        return !inode->i_nlink || inode_unhashed(inode);
 }
 extern void d_mark_dontcache(struct inode *inode);

...where generic code skips over dax-inodes with pinned pages.

> 
> > > /me looks deeper into the dax_layout_busy_page() stuff and realises
> > > that both ext4 and XFS implementations of ext4_break_layouts() and
> > > xfs_break_dax_layouts() are actually identical.
> > > 
> > > That is, filemap_invalidate_unlock() and xfs_iunlock(ip,
> > > XFS_MMAPLOCK_EXCL) operate on exactly the same
> > > inode->i_mapping->invalidate_lock. Hence the implementations in ext4
> > > and XFS are both functionally identical.
> > 
> > I assume you mean for the purposes of this "final" break since
> > xfs_file_allocate() holds XFS_IOLOCK_EXCL over xfs_break_layouts().
> 
> No, I'm just looking at the two *dax* functions - we don't care what
> locks xfs_break_layouts() requires - dax mapping manipulation is
> covered by the mapping->invalidate_lock and not the inode->i_rwsem.
> This is explicitly documented in the code by the the asserts in both
> ext4_break_layouts() and xfs_break_dax_layouts().
> 
> XFS holds the inode->i_rwsem over xfs_break_layouts() because we
> have to break *file layout leases* from there, too. These are
> serialised by the inode->i_rwsem, not the mapping->invalidate_lock.

Got it, will make generic helpers for the scenario where only
dax-break-layouts needs to be performed.
Jason Gunthorpe Sept. 21, 2022, 10:28 p.m. UTC | #3
On Thu, Sep 22, 2022 at 08:14:16AM +1000, Dave Chinner wrote:

> Where are these DAX page pins that don't require the pin holder to
> also hold active references to the filesystem objects coming from?

O_DIRECT and things like it.

The concept has been that what you called revoke is just generic code
to wait until all short term users put back their pins, as short term
uses must do according to the !FOLL_LONGTERM contract.

Jason
Dan Williams Sept. 22, 2022, 12:02 a.m. UTC | #4
Dave Chinner wrote:
> On Tue, Sep 20, 2022 at 09:44:52AM -0700, Dan Williams wrote:
> > Dave Chinner wrote:
> > > On Mon, Sep 19, 2022 at 09:11:48AM -0700, Dan Williams wrote:
> > > > Dave Chinner wrote:
> > > > > That all said, this really looks like a bit of a band-aid.
> > > > 
> > > > It definitely is since DAX is in this transitory state between doing
> > > > some activities page-less and others with page metadata. If DAX was
> > > > fully committed to behaving like a typical page then
> > > > unmap_mapping_range() would have already satisfied this reference
> > > > counting situation.
> > > > 
> > > > > I can't work out why would we we ever have an actual layout lease
> > > > > here that needs breaking given they are file based and active files
> > > > > hold a reference to the inode. If we ever break that, then I suspect
> > > > > this change will cause major problems for anyone using pNFS with XFS
> > > > > as xfs_break_layouts() can end up waiting for NFS delegation
> > > > > revocation. This is something we should never be doing in inode
> > > > > eviction/memory reclaim.
> > > > > 
> > > > > Hence I have to ask why this lease break is being done
> > > > > unconditionally for all inodes, instead of only calling
> > > > > xfs_break_dax_layouts() directly on DAX enabled regular files?  I
> > > > > also wonder what exciting new system deadlocks this will create
> > > > > because BREAK_UNMAP_FINAL can essentially block forever waiting on
> > > > > dax mappings going away. If that DAX mapping reclaim requires memory
> > > > > allocations.....
> > > > 
> > > > There should be no memory allocations in the DAX mapping reclaim path.
> > > > Also, the page pins it waits for are precluded from being GUP_LONGTERM.
> > > 
> > > So if the task that holds the pin needs memory allocation before it
> > > can unpin the page to allow direct inode reclaim to make progress?
> > 
> > No, it couldn't, and I realize now that GUP_LONGTERM has nothing to do
> > with this hang since any GFP_KERNEL in a path that took a DAX page pin
> > path could run afoul of this need to wait.
> > 
> > So, this has me looking at invalidate_inodes() and iput_final(), where I
> > did not see the reclaim entanglement, and thinking DAX has the unique
> > requirement to make sure that no access to a page outlives the hosting
> > inode.
> > 
> > Not that I need to tell you, but to get my own thinking straight,
> > compare that to typical page cache as the pinner can keep a pinned
> > page-cache page as long as it wants even after it has been truncated.
> 
> Right, because the page pin prevents the page from being freed
> after the page references the page cache keeps have been released.
> 
> But page cache page != DAX page. The DAX page is a direct reference
> to the storage media, not a generic reference counted kernel page
> that the kernel will keep alive as long as there is a reference to
> it.
> 
> Hence for a DAX page, we have to revoke all access to the page
> before the controlling owner context is torn down, otherwise we have
> a use-after-free scenario at the storage media level. For a FSDAX
> file data page, that owner context is the inode...
> 
> > DAX needs to make sure that truncate_inode_pages() ceases all access to
> > the page synchronous with the truncate.
> 
> Yes, exactly.
> 
> >
> > The typical page-cache will
> > ensure that the next mapping of the file will get a new page if the page
> > previously pinned for that offset is still in use, DAX can not offer
> > that as the same page that was previously pinned is always used.
> 
> Yes, because the new DAX ipage lookup will return the original page
> in the storage media, not a newly instantiated page cache page.
> 
> > So I think this means something like this:
> > 
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 6462276dfdf0..ab16772b9a8d 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -784,6 +784,11 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
> >                         continue;
> >                 }
> >  
> > +               if (dax_inode_busy(inode)) {
> > +                       busy = 1;
> > +                       continue;
> > +               }
> 
> That this does more than a check (i.e. it runs whatever
> dax_zap_pages() does) means it cannot be run under the inode
> spinlock.

Here lockdep did immediately scream at me for what can be done under the
inode lock.

> As this is called from the block device code when a bdev is being
> removed (i.e. will only find a superblock and inodes to invalidate
> on hot-unplug), shouldn't this DAX mapping invalidation actually be
> handled by the pmem failure notification infrastructure we've just
> added for reflink?

Perhaps. I think the patch I have in the works now is simpler without
having to require ext4 to add notify_failure infrastructure, but that
may be where this ends up.

> 
> > +
> >                 inode->i_state |= I_FREEING;
> >                 inode_lru_list_del(inode);
> >                 spin_unlock(&inode->i_lock);
> > @@ -1733,6 +1738,8 @@ static void iput_final(struct inode *inode)
> >                 spin_unlock(&inode->i_lock);
> >  
> >                 write_inode_now(inode, 1);
> > +               if (IS_DAX(inode))
> > +                       dax_break_layouts(inode);
> >  
> >                 spin_lock(&inode->i_lock);
> >                 state = inode->i_state;
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 9eced4cc286e..e4a74ab310b5 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -3028,8 +3028,20 @@ extern struct inode * igrab(struct inode *);
> >  extern ino_t iunique(struct super_block *, ino_t);
> >  extern int inode_needs_sync(struct inode *inode);
> >  extern int generic_delete_inode(struct inode *inode);
> > +
> > +static inline bool dax_inode_busy(struct inode *inode)
> > +{
> > +       if (!IS_DAX(inode))
> > +               return false;
> > +
> > +       return dax_zap_pages(inode) != NULL;
> > +}
> > +
> >  static inline int generic_drop_inode(struct inode *inode)
> >  {
> > +       if (dax_inode_busy(inode))
> > +               return 0;
> > +
> >         return !inode->i_nlink || inode_unhashed(inode);
> >  }
> 
> I don't think that's valid. This can result in unreferenced unlinked
> inodes that should be torn down immediately being placed in the LRU
> and cached in memory and potentially not processed until there is
> future memory pressure or an unmount....
> 
> i.e. dropping the final reference on an unlinked inode needs to
> reclaim the inode immediately and allow the filesystem to free the
> inode, regardless of any other factor. Nothing should have an active
> reference to the inode or inode related data/metadata at this point
> in time.
> 
> Honestly, this still seems like a band-aid because it doesn't appear
> to address that something has pinned the storage media without
> having an active reference to the object that arbitrates access to
> that storage media (i.e. the inode and, by proxy, then filesystem).
> Where are these DAX page pins that don't require the pin holder to
> also hold active references to the filesystem objects coming from?

I do not have a practical exploit for this only the observation that
iput_final() triggers truncate_inode_pages(). Then the follow-on
assumption that *if* pages are still recorded in inode->i_pages then
those pages could have an elevated reference count.

Certainly GUP_LONGTERM can set up these "memory registration" scenarios,
but those are forbidden.

The scenario I cannot convince myself is impossible is a driver that
goes into interruptible sleep while operating on a page it got from
get_user_pages(). Where the eventual driver completion path will clean
up the pinned page, but the process that launched the I/O has already
exited and dropped all the inode references it was holding. That's not
buggy on its face since the driver still cleans up everything it was
handed, but if this type of disconnect happens (closing mappings and
files while I/O is in-flight) then iput_final() needs to check.

The block-I/O submission path seems to be uninterruptible to prevent
this type of disconnect, but who knows what other drivers do with their
pages.
Jason Gunthorpe Sept. 22, 2022, 12:10 a.m. UTC | #5
On Wed, Sep 21, 2022 at 05:02:37PM -0700, Dan Williams wrote:

> The scenario I cannot convince myself is impossible is a driver that
> goes into interruptible sleep while operating on a page it got from
> get_user_pages(). Where the eventual driver completion path will clean
> up the pinned page, but the process that launched the I/O has already
> exited and dropped all the inode references it was holding. That's not
> buggy on its face since the driver still cleans up everything it was
> handed, but if this type of disconnect happens (closing mappings and
> files while I/O is in-flight) then iput_final() needs to check.

I don't think you can make this argument. The inode you are talking
about is held in the vma of the mm_struct, it is not just a process
exit or interrupted sleep that could cause the vma to drop the inode
reference, but any concurrent thread doing memunmap/close can destroy
the VMA, close the FD and release the inode.

So userspace can certainly create races where something has safely
done GUP/PUP !FOLL_LONGTERM but the VMA that sourced the page is
destroyed while the thread is still processing the post-GUP work.

Jason
Dan Williams Sept. 23, 2022, 12:41 a.m. UTC | #6
Dave Chinner wrote:
> On Wed, Sep 21, 2022 at 07:28:51PM -0300, Jason Gunthorpe wrote:
> > On Thu, Sep 22, 2022 at 08:14:16AM +1000, Dave Chinner wrote:
> > 
> > > Where are these DAX page pins that don't require the pin holder to
> > > also hold active references to the filesystem objects coming from?
> > 
> > O_DIRECT and things like it.
> 
> O_DIRECT IO to a file holds a reference to a struct file which holds
> an active reference to the struct inode. Hence you can't reclaim an
> inode while an O_DIRECT IO is in progress to it. 
> 
> Similarly, file-backed pages pinned from user vmas have the inode
> pinned by the VMA having a reference to the struct file passed to
> them when they are instantiated. Hence anything using mmap() to pin
> file-backed pages (i.e. applications using FSDAX access from
> userspace) should also have a reference to the inode that prevents
> the inode from being reclaimed.
> 
> So I'm at a loss to understand what "things like it" might actually
> mean. Can you actually describe a situation where we actually permit
> (even temporarily) these use-after-free scenarios?

Jason mentioned a scenario here:

https://lore.kernel.org/all/YyuoE8BgImRXVkkO@nvidia.com/

Multi-thread process where thread1 does open(O_DIRECT)+mmap()+read() and
thread2 does memunmap()+close() while the read() is inflight.

Sounds plausible to me, but I have not tried to trigger it with a focus
test.
Jan Kara Sept. 23, 2022, 9:38 a.m. UTC | #7
On Fri 23-09-22 12:10:12, Dave Chinner wrote:
> On Thu, Sep 22, 2022 at 05:41:08PM -0700, Dan Williams wrote:
> > Dave Chinner wrote:
> > > On Wed, Sep 21, 2022 at 07:28:51PM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Sep 22, 2022 at 08:14:16AM +1000, Dave Chinner wrote:
> > > > 
> > > > > Where are these DAX page pins that don't require the pin holder to
> > > > > also hold active references to the filesystem objects coming from?
> > > > 
> > > > O_DIRECT and things like it.
> > > 
> > > O_DIRECT IO to a file holds a reference to a struct file which holds
> > > an active reference to the struct inode. Hence you can't reclaim an
> > > inode while an O_DIRECT IO is in progress to it. 
> > > 
> > > Similarly, file-backed pages pinned from user vmas have the inode
> > > pinned by the VMA having a reference to the struct file passed to
> > > them when they are instantiated. Hence anything using mmap() to pin
> > > file-backed pages (i.e. applications using FSDAX access from
> > > userspace) should also have a reference to the inode that prevents
> > > the inode from being reclaimed.
> > > 
> > > So I'm at a loss to understand what "things like it" might actually
> > > mean. Can you actually describe a situation where we actually permit
> > > (even temporarily) these use-after-free scenarios?
> > 
> > Jason mentioned a scenario here:
> > 
> > https://lore.kernel.org/all/YyuoE8BgImRXVkkO@nvidia.com/
> > 
> > Multi-thread process where thread1 does open(O_DIRECT)+mmap()+read() and
> > thread2 does memunmap()+close() while the read() is inflight.
> 
> And, ah, what production application does this and expects to be
> able to process the result of the read() operation without getting a
> SEGV?
> 
> There's a huge difference between an unlikely scenario which we need
> to work (such as O_DIRECT IO to/from a mmap() buffer at a different
> offset on the same file) and this sort of scenario where even if we
> handle it correctly, the application can't do anything with the
> result and will crash immediately....

I'm not sure I fully follow what we are concerned about here. As you've
written above direct IO holds reference to the inode until it is completed
(through kiocb->file->inode chain). So direct IO should be safe?

I'd be more worried about stuff like vmsplice() that can add file pages
into pipe without holding inode alive in any way and keeping them there for
arbitrarily long time. Didn't we want to add FOLL_LONGTERM to gup executed
from vmsplice() to avoid issues like this?

> > Sounds plausible to me, but I have not tried to trigger it with a focus
> > test.
> 
> If there really are applications this .... broken, then it's not the
> responsibility of the filesystem to paper over the low level page
> reference tracking issues that cause it.
> 
> i.e. The underlying problem here is that memunmap() frees the VMA
> while there are still active task-based references to the pages in
> that VMA. IOWs, the VMA should not be torn down until the O_DIRECT
> read has released all the references to the pages mapped into the
> task address space.
> 
> This just doesn't seem like an issue that we should be trying to fix
> by adding band-aids to the inode life-cycle management.

I agree that freeing VMA while there are pinned pages is ... inconvenient.
But that is just how gup works since the beginning - the moment you have
struct page reference, you completely forget about the mapping you've used
to get to the page. So anything can happen with the mapping after that
moment. And in case of pages mapped by multiple processes I can easily see
that one of the processes decides to unmap the page (and it may well be
that was the initial process that acquired page references) while others
still keep accessing the page using page references stored in some internal
structure (RDMA anyone?). I think it will be rather difficult to come up
with some scheme keeping VMA alive while there are pages pinned without
regressing userspace which over the years became very much tailored to the
peculiar gup behavior.

I can imagine we would keep *inode* referenced while there are its pages
pinned. That should not be that difficult but at least in naive
implementation that would put rather heavy stress on inode refcount under
some loads so I don't think that's useful either.

								Honza
Jason Gunthorpe Sept. 23, 2022, 12:39 p.m. UTC | #8
On Fri, Sep 23, 2022 at 12:10:12PM +1000, Dave Chinner wrote:

> > Jason mentioned a scenario here:
> > 
> > https://lore.kernel.org/all/YyuoE8BgImRXVkkO@nvidia.com/
> > 
> > Multi-thread process where thread1 does open(O_DIRECT)+mmap()+read() and
> > thread2 does memunmap()+close() while the read() is inflight.
> 
> And, ah, what production application does this and expects to be
> able to process the result of the read() operation without getting a
> SEGV?

The read() will do GUP and get a pined page, next the memunmap()/close
will release the inode the VMA was holding open. The read() FD is NOT
a DAX FD.

We are now UAFing the DAX storage. There is no SEGV.

It is not about sane applications, it is about kernel security against
hostile userspace.

> i.e. The underlying problem here is that memunmap() frees the VMA
> while there are still active task-based references to the pages in
> that VMA. IOWs, the VMA should not be torn down until the O_DIRECT
> read has released all the references to the pages mapped into the
> task address space.

This is Jan's suggestion, I think we are still far from being able to
do that for O_DIRECT paths.

Even if you fix the close() this way, doesn't truncate still have the
same problem?

At the end of the day the rule is a DAX page must not be re-used until
its refcount is 0. At some point the FS should wait for.

Jason
Dan Williams Sept. 23, 2022, 11:06 p.m. UTC | #9
Jan Kara wrote:
> On Fri 23-09-22 12:10:12, Dave Chinner wrote:
> > On Thu, Sep 22, 2022 at 05:41:08PM -0700, Dan Williams wrote:
> > > Dave Chinner wrote:
> > > > On Wed, Sep 21, 2022 at 07:28:51PM -0300, Jason Gunthorpe wrote:
> > > > > On Thu, Sep 22, 2022 at 08:14:16AM +1000, Dave Chinner wrote:
> > > > > 
> > > > > > Where are these DAX page pins that don't require the pin holder to
> > > > > > also hold active references to the filesystem objects coming from?
> > > > > 
> > > > > O_DIRECT and things like it.
> > > > 
> > > > O_DIRECT IO to a file holds a reference to a struct file which holds
> > > > an active reference to the struct inode. Hence you can't reclaim an
> > > > inode while an O_DIRECT IO is in progress to it. 
> > > > 
> > > > Similarly, file-backed pages pinned from user vmas have the inode
> > > > pinned by the VMA having a reference to the struct file passed to
> > > > them when they are instantiated. Hence anything using mmap() to pin
> > > > file-backed pages (i.e. applications using FSDAX access from
> > > > userspace) should also have a reference to the inode that prevents
> > > > the inode from being reclaimed.
> > > > 
> > > > So I'm at a loss to understand what "things like it" might actually
> > > > mean. Can you actually describe a situation where we actually permit
> > > > (even temporarily) these use-after-free scenarios?
> > > 
> > > Jason mentioned a scenario here:
> > > 
> > > https://lore.kernel.org/all/YyuoE8BgImRXVkkO@nvidia.com/
> > > 
> > > Multi-thread process where thread1 does open(O_DIRECT)+mmap()+read() and
> > > thread2 does memunmap()+close() while the read() is inflight.
> > 
> > And, ah, what production application does this and expects to be
> > able to process the result of the read() operation without getting a
> > SEGV?
> > 
> > There's a huge difference between an unlikely scenario which we need
> > to work (such as O_DIRECT IO to/from a mmap() buffer at a different
> > offset on the same file) and this sort of scenario where even if we
> > handle it correctly, the application can't do anything with the
> > result and will crash immediately....
> 
> I'm not sure I fully follow what we are concerned about here. As you've
> written above direct IO holds reference to the inode until it is completed
> (through kiocb->file->inode chain). So direct IO should be safe?
> 
> I'd be more worried about stuff like vmsplice() that can add file pages
> into pipe without holding inode alive in any way and keeping them there for
> arbitrarily long time. Didn't we want to add FOLL_LONGTERM to gup executed
> from vmsplice() to avoid issues like this?
> 
> > > Sounds plausible to me, but I have not tried to trigger it with a focus
> > > test.
> > 
> > If there really are applications this .... broken, then it's not the
> > responsibility of the filesystem to paper over the low level page
> > reference tracking issues that cause it.
> > 
> > i.e. The underlying problem here is that memunmap() frees the VMA
> > while there are still active task-based references to the pages in
> > that VMA. IOWs, the VMA should not be torn down until the O_DIRECT
> > read has released all the references to the pages mapped into the
> > task address space.
> > 
> > This just doesn't seem like an issue that we should be trying to fix
> > by adding band-aids to the inode life-cycle management.
> 
> I agree that freeing VMA while there are pinned pages is ... inconvenient.
> But that is just how gup works since the beginning - the moment you have
> struct page reference, you completely forget about the mapping you've used
> to get to the page. So anything can happen with the mapping after that
> moment. And in case of pages mapped by multiple processes I can easily see
> that one of the processes decides to unmap the page (and it may well be
> that was the initial process that acquired page references) while others
> still keep accessing the page using page references stored in some internal
> structure (RDMA anyone?). I think it will be rather difficult to come up
> with some scheme keeping VMA alive while there are pages pinned without
> regressing userspace which over the years became very much tailored to the
> peculiar gup behavior.
> 
> I can imagine we would keep *inode* referenced while there are its pages
> pinned. That should not be that difficult but at least in naive
> implementation that would put rather heavy stress on inode refcount under
> some loads so I don't think that's useful either.

What about instead of keeping the inode *referenced* while there might
be pinned pages, keep the inode *dirty*? Then
dax_writeback_mapping_range() can watch for inodes in the I_WILL_FREE
state and know this is the last chance to break layouts and truncate
mappings before the inode goes out of scope.

That feels clean and unburdensome to me, but this would not be the first
time I have overlooked a filesystem constraint.
Jason Gunthorpe Sept. 26, 2022, 1:04 p.m. UTC | #10
On Mon, Sep 26, 2022 at 10:34:30AM +1000, Dave Chinner wrote:

> > It is not about sane applications, it is about kernel security against
> > hostile userspace.
> 
> Turning this into a UAF doesn't provide any security at all. It
> makes this measurable worse from a security POV as it provides a
> channel for data leakage (read() case) or system unstability or
> compromise (the write() case).

You asked what the concern is, I think you get it, you explained it to
Jan in another email.

We have this issue where if we are not careful we can create a UAF bug
through GUP. It is not something a real application will hit, this is
kernel self-protection against hostile user space trying to trigger a
UAF. The issue arises from both the FS and the MM having their own
lifecycle models for the same memory page.

I'm still not clear on exactly what the current state of affairs is,
Dan?

The DAX/FSDAX stuff currently has a wait on the struct page - does
that wait protect against these UAFs? It looks to me like that is what
it is suppposed to do?

If so, that wait simply needs to be transformed into a wait for the
refcount to be 0 when you rework the refcounting. 

This is not the same FOLL_LONGTERM discussion rehashed, all the
FOLL_LONGTERM discussions were predicated on the idea that GUP
actually worked and doesn't have UAF bugs.

> The *storage media* must not be reused until the filesystem says it
> can be reused. And for that to work, nothing is allowed to keep an
> anonymous, non-filesystem reference to the storage media. It has
> nothing to do with struct page reference counts, and everything to
> do with ensuring that filesystem objects are correctly referenced
> while the storage media is in direct use by an application.

The trouble is we have *two* things that think they own the media -
the mm through pgmap clearly is the owner of the struct page and has
its own well defined lifecycle model for it.

And the FS has its model. We have to ensure the two models are tied
together, a page in the media cannot be considered free until both
lifecycle models agree it is free.

This is a side effect of using the struct pages in the first place,
currently the FS can't use struct page but opt out of the mm's
lifecycle model for struct page!

If we want the FS to own everything exclusively we should purge the
struct pages completely and give up all the features that come with
them (like GUP)

Jason
Jan Kara Sept. 26, 2022, 2:10 p.m. UTC | #11
On Mon 26-09-22 09:54:07, Dave Chinner wrote:
> On Fri, Sep 23, 2022 at 11:38:03AM +0200, Jan Kara wrote:
> > On Fri 23-09-22 12:10:12, Dave Chinner wrote:
> > > On Thu, Sep 22, 2022 at 05:41:08PM -0700, Dan Williams wrote:
> > > > Dave Chinner wrote:
> > > > > On Wed, Sep 21, 2022 at 07:28:51PM -0300, Jason Gunthorpe wrote:
> > > > > > On Thu, Sep 22, 2022 at 08:14:16AM +1000, Dave Chinner wrote:
> > > > > > 
> > > > > > > Where are these DAX page pins that don't require the pin holder to
> > > > > > > also hold active references to the filesystem objects coming from?
> > > > > > 
> > > > > > O_DIRECT and things like it.
> > > > > 
> > > > > O_DIRECT IO to a file holds a reference to a struct file which holds
> > > > > an active reference to the struct inode. Hence you can't reclaim an
> > > > > inode while an O_DIRECT IO is in progress to it. 
> > > > > 
> > > > > Similarly, file-backed pages pinned from user vmas have the inode
> > > > > pinned by the VMA having a reference to the struct file passed to
> > > > > them when they are instantiated. Hence anything using mmap() to pin
> > > > > file-backed pages (i.e. applications using FSDAX access from
> > > > > userspace) should also have a reference to the inode that prevents
> > > > > the inode from being reclaimed.
> > > > > 
> > > > > So I'm at a loss to understand what "things like it" might actually
> > > > > mean. Can you actually describe a situation where we actually permit
> > > > > (even temporarily) these use-after-free scenarios?
> > > > 
> > > > Jason mentioned a scenario here:
> > > > 
> > > > https://lore.kernel.org/all/YyuoE8BgImRXVkkO@nvidia.com/
> > > > 
> > > > Multi-thread process where thread1 does open(O_DIRECT)+mmap()+read() and
> > > > thread2 does memunmap()+close() while the read() is inflight.
> > > 
> > > And, ah, what production application does this and expects to be
> > > able to process the result of the read() operation without getting a
> > > SEGV?
> > > 
> > > There's a huge difference between an unlikely scenario which we need
> > > to work (such as O_DIRECT IO to/from a mmap() buffer at a different
> > > offset on the same file) and this sort of scenario where even if we
> > > handle it correctly, the application can't do anything with the
> > > result and will crash immediately....
> > 
> > I'm not sure I fully follow what we are concerned about here. As you've
> > written above direct IO holds reference to the inode until it is completed
> > (through kiocb->file->inode chain). So direct IO should be safe?
> 
> AFAICT, it's the user buffer allocated by mmap() that the direct IO
> is DMAing into/out of that is the issue here. i.e. mmap() a file
> that is DAX enabled, pass the mmap region to DIO on a non-dax file,
> GUP in the DIO path takes a page pin on user pages that are DAX
> mapped, the userspace application then unmaps the file pages and
> unlinks the FSDAX file.
> 
> At this point the FSDAX mapped inode has no active references, so
> the filesystem frees the inode and it's allocated storage space, and
> now the DIO or whatever is holding the GUP reference is
> now a moving storage UAF violation. What ever is holding the GUP
> reference doesn't even have a reference to the FSDAX filesystem -
> the DIO fd could point to a file in a different filesystem
> altogether - and so the fsdax filesytem could be unmounted at this
> point whilst the application is still actively using the storage
> underlying the filesystem.
> 
> That's just .... broken.

Hum, so I'm confused (and my last email probably was as well). So let me
spell out the details here so that I can get on the same page about what we
are trying to solve:

For FSDAX, backing storage for a page must not be freed (i.e., filesystem
must to free corresponding block) while there are some references to the
page. This is achieved by calls to dax_layout_busy_page() from the
filesystem before truncating file / punching hole into a file. So AFAICT
this is working correctly and I don't think the patch series under
discussion aims to change this besides the change in how page without
references is detected.

Now there is a separate question that while someone holds a reference to
FSDAX page, the inode this page belongs to can get evicted from memory. For
FSDAX nothing prevents that AFAICT. If this happens, we loose track of the
page<->inode association so if somebody later comes and truncates the
inode, we will not detect the page belonging to the inode is still in use
(dax_layout_busy_page() does not find the page) and we have a problem.
Correct?

> > I'd be more worried about stuff like vmsplice() that can add file pages
> > into pipe without holding inode alive in any way and keeping them there for
> > arbitrarily long time. Didn't we want to add FOLL_LONGTERM to gup executed
> > from vmsplice() to avoid issues like this?
> 
> Yes, ISTR that was part of the plan - use FOLL_LONGTERM to ensure
> FSDAX can't run operations that pin pages but don't take fs
> references. I think that's how we prevented RDMA users from pinning
> FSDAX direct mapped storage media in this way. It does not, however,
> prevent the above "short term" GUP UAF situation from occurring.

If what I wrote above is correct, then I understand and agree.

> > I agree that freeing VMA while there are pinned pages is ... inconvenient.
> > But that is just how gup works since the beginning - the moment you have
> > struct page reference, you completely forget about the mapping you've used
> > to get to the page. So anything can happen with the mapping after that
> > moment. And in case of pages mapped by multiple processes I can easily see
> > that one of the processes decides to unmap the page (and it may well be
> > that was the initial process that acquired page references) while others
> > still keep accessing the page using page references stored in some internal
> > structure (RDMA anyone?).
> 
> Yup, and this is why RDMA on FSDAX using this method of pinning pages
> will end up corrupting data and filesystems, hence FOLL_LONGTERM
> protecting against most of these situations from even arising. But
> that's that workaround, not a long term solution that allows RDMA to
> be run on FSDAX managed storage media.
> 
> I said on #xfs a few days ago:
> 
> [23/9/22 10:23] * dchinner is getting deja vu over this latest round
> of "dax mappings don't pin the filesystem objects that own the
> storage media being mapped"
> 
> And I'm getting that feeling again right now...
> 
> > I think it will be rather difficult to come up
> > with some scheme keeping VMA alive while there are pages pinned without
> > regressing userspace which over the years became very much tailored to the
> > peculiar gup behavior.
> 
> Perhaps all we should do is add a page flag for fsdax mapped pages
> that says GUP must pin the VMA, so only mapped pages that fall into
> this category take the perf penalty of VMA management.

Possibly. But my concern with VMA pinning was not only about performance
but also about applications relying on being able to unmap pages that are
currently pinned. At least from some processes one of which may be the one
doing the original pinning. But yeah, the fact that FOLL_LONGTERM is
forbidden with DAX somewhat restricts the insanity we have to deal with. So
maybe pinning the VMA for DAX mappings might actually be a workable
solution.

								Honza
Dan Williams Sept. 29, 2022, 11:33 p.m. UTC | #12
Jan Kara wrote:
> On Mon 26-09-22 09:54:07, Dave Chinner wrote:
> > On Fri, Sep 23, 2022 at 11:38:03AM +0200, Jan Kara wrote:
> > > On Fri 23-09-22 12:10:12, Dave Chinner wrote:
> > > > On Thu, Sep 22, 2022 at 05:41:08PM -0700, Dan Williams wrote:
> > > > > Dave Chinner wrote:
> > > > > > On Wed, Sep 21, 2022 at 07:28:51PM -0300, Jason Gunthorpe wrote:
> > > > > > > On Thu, Sep 22, 2022 at 08:14:16AM +1000, Dave Chinner wrote:
> > > > > > > 
> > > > > > > > Where are these DAX page pins that don't require the pin holder to
> > > > > > > > also hold active references to the filesystem objects coming from?
> > > > > > > 
> > > > > > > O_DIRECT and things like it.
> > > > > > 
> > > > > > O_DIRECT IO to a file holds a reference to a struct file which holds
> > > > > > an active reference to the struct inode. Hence you can't reclaim an
> > > > > > inode while an O_DIRECT IO is in progress to it. 
> > > > > > 
> > > > > > Similarly, file-backed pages pinned from user vmas have the inode
> > > > > > pinned by the VMA having a reference to the struct file passed to
> > > > > > them when they are instantiated. Hence anything using mmap() to pin
> > > > > > file-backed pages (i.e. applications using FSDAX access from
> > > > > > userspace) should also have a reference to the inode that prevents
> > > > > > the inode from being reclaimed.
> > > > > > 
> > > > > > So I'm at a loss to understand what "things like it" might actually
> > > > > > mean. Can you actually describe a situation where we actually permit
> > > > > > (even temporarily) these use-after-free scenarios?
> > > > > 
> > > > > Jason mentioned a scenario here:
> > > > > 
> > > > > https://lore.kernel.org/all/YyuoE8BgImRXVkkO@nvidia.com/
> > > > > 
> > > > > Multi-thread process where thread1 does open(O_DIRECT)+mmap()+read() and
> > > > > thread2 does memunmap()+close() while the read() is inflight.
> > > > 
> > > > And, ah, what production application does this and expects to be
> > > > able to process the result of the read() operation without getting a
> > > > SEGV?
> > > > 
> > > > There's a huge difference between an unlikely scenario which we need
> > > > to work (such as O_DIRECT IO to/from a mmap() buffer at a different
> > > > offset on the same file) and this sort of scenario where even if we
> > > > handle it correctly, the application can't do anything with the
> > > > result and will crash immediately....
> > > 
> > > I'm not sure I fully follow what we are concerned about here. As you've
> > > written above direct IO holds reference to the inode until it is completed
> > > (through kiocb->file->inode chain). So direct IO should be safe?
> > 
> > AFAICT, it's the user buffer allocated by mmap() that the direct IO
> > is DMAing into/out of that is the issue here. i.e. mmap() a file
> > that is DAX enabled, pass the mmap region to DIO on a non-dax file,
> > GUP in the DIO path takes a page pin on user pages that are DAX
> > mapped, the userspace application then unmaps the file pages and
> > unlinks the FSDAX file.
> > 
> > At this point the FSDAX mapped inode has no active references, so
> > the filesystem frees the inode and it's allocated storage space, and
> > now the DIO or whatever is holding the GUP reference is
> > now a moving storage UAF violation. What ever is holding the GUP
> > reference doesn't even have a reference to the FSDAX filesystem -
> > the DIO fd could point to a file in a different filesystem
> > altogether - and so the fsdax filesytem could be unmounted at this
> > point whilst the application is still actively using the storage
> > underlying the filesystem.
> > 
> > That's just .... broken.
> 
> Hum, so I'm confused (and my last email probably was as well). So let me
> spell out the details here so that I can get on the same page about what we
> are trying to solve:
> 
> For FSDAX, backing storage for a page must not be freed (i.e., filesystem
> must to free corresponding block) while there are some references to the
> page. This is achieved by calls to dax_layout_busy_page() from the
> filesystem before truncating file / punching hole into a file. So AFAICT
> this is working correctly and I don't think the patch series under
> discussion aims to change this besides the change in how page without
> references is detected.

Correct. All the nominal truncate paths via hole punch and
truncate_setsize() are already handled for a long time now. However,
what was not covered was the truncate that happens at iput_final() time.
In that case the code has just been getting lucky for all that time.
There is thankfully a WARN() that will trigger if the iput_final()
truncate happens while a page is referenced, so it is at least not
silent.

I know Dave is tired of this discussion, but every time he engages the
solution gets better, like finding this iput_final() bug, so I hope he
continues to engage here and I'm grateful for the help.

> Now there is a separate question that while someone holds a reference to
> FSDAX page, the inode this page belongs to can get evicted from memory. For
> FSDAX nothing prevents that AFAICT. If this happens, we loose track of the
> page<->inode association so if somebody later comes and truncates the
> inode, we will not detect the page belonging to the inode is still in use
> (dax_layout_busy_page() does not find the page) and we have a problem.
> Correct?

The WARN would fire at iput_final(). Everything that happens after
that is in UAF territory. In my brief search I did not see reports of
this WARN firing, but it is past time to fix it.

> > > I'd be more worried about stuff like vmsplice() that can add file pages
> > > into pipe without holding inode alive in any way and keeping them there for
> > > arbitrarily long time. Didn't we want to add FOLL_LONGTERM to gup executed
> > > from vmsplice() to avoid issues like this?
> > 
> > Yes, ISTR that was part of the plan - use FOLL_LONGTERM to ensure
> > FSDAX can't run operations that pin pages but don't take fs
> > references. I think that's how we prevented RDMA users from pinning
> > FSDAX direct mapped storage media in this way. It does not, however,
> > prevent the above "short term" GUP UAF situation from occurring.
> 
> If what I wrote above is correct, then I understand and agree.
> 
> > > I agree that freeing VMA while there are pinned pages is ... inconvenient.
> > > But that is just how gup works since the beginning - the moment you have
> > > struct page reference, you completely forget about the mapping you've used
> > > to get to the page. So anything can happen with the mapping after that
> > > moment. And in case of pages mapped by multiple processes I can easily see
> > > that one of the processes decides to unmap the page (and it may well be
> > > that was the initial process that acquired page references) while others
> > > still keep accessing the page using page references stored in some internal
> > > structure (RDMA anyone?).
> > 
> > Yup, and this is why RDMA on FSDAX using this method of pinning pages
> > will end up corrupting data and filesystems, hence FOLL_LONGTERM
> > protecting against most of these situations from even arising. But
> > that's that workaround, not a long term solution that allows RDMA to
> > be run on FSDAX managed storage media.
> > 
> > I said on #xfs a few days ago:
> > 
> > [23/9/22 10:23] * dchinner is getting deja vu over this latest round
> > of "dax mappings don't pin the filesystem objects that own the
> > storage media being mapped"
> > 
> > And I'm getting that feeling again right now...
> > 
> > > I think it will be rather difficult to come up
> > > with some scheme keeping VMA alive while there are pages pinned without
> > > regressing userspace which over the years became very much tailored to the
> > > peculiar gup behavior.
> > 
> > Perhaps all we should do is add a page flag for fsdax mapped pages
> > that says GUP must pin the VMA, so only mapped pages that fall into
> > this category take the perf penalty of VMA management.
> 
> Possibly. But my concern with VMA pinning was not only about performance
> but also about applications relying on being able to unmap pages that are
> currently pinned. At least from some processes one of which may be the one
> doing the original pinning. But yeah, the fact that FOLL_LONGTERM is
> forbidden with DAX somewhat restricts the insanity we have to deal with. So
> maybe pinning the VMA for DAX mappings might actually be a workable
> solution.

As far as I can see, VMAs are not currently reference counted they are
just added / deleted from an mm_struct, and nothing guarantees
mapping_mapped() stays true while a page is pinned.

I like Dave's mental model that the inode is the arbiter for the page,
and the arbiter is not allowed to go out of scope before asserting that
everything it granted previously has been returned.

write_inode_now() unconditionally invokes dax_writeback_mapping_range()
when the inode is committed to going out of scope. write_inode_now() is
allowed to sleep until all dirty mapping entries are written back. I see
nothing wrong with additionally checking for entries with elevated page
reference counts and doing a:

    __wait_var_event(page, dax_page_idle(page));

Since the inode is out of scope there should be no concerns with racing
new 0 -> 1 page->_refcount transitions. Just wait for transient page
pins to finally drain to zero which should already be on the order of
the wait time to complete synchrounous writeback in the dirty inode
case.
Jan Kara Sept. 30, 2022, 1:41 p.m. UTC | #13
On Thu 29-09-22 16:33:27, Dan Williams wrote:
> Jan Kara wrote:
> > On Mon 26-09-22 09:54:07, Dave Chinner wrote:
> > > > I'd be more worried about stuff like vmsplice() that can add file pages
> > > > into pipe without holding inode alive in any way and keeping them there for
> > > > arbitrarily long time. Didn't we want to add FOLL_LONGTERM to gup executed
> > > > from vmsplice() to avoid issues like this?
> > > 
> > > Yes, ISTR that was part of the plan - use FOLL_LONGTERM to ensure
> > > FSDAX can't run operations that pin pages but don't take fs
> > > references. I think that's how we prevented RDMA users from pinning
> > > FSDAX direct mapped storage media in this way. It does not, however,
> > > prevent the above "short term" GUP UAF situation from occurring.
> > 
> > If what I wrote above is correct, then I understand and agree.
> > 
> > > > I agree that freeing VMA while there are pinned pages is ... inconvenient.
> > > > But that is just how gup works since the beginning - the moment you have
> > > > struct page reference, you completely forget about the mapping you've used
> > > > to get to the page. So anything can happen with the mapping after that
> > > > moment. And in case of pages mapped by multiple processes I can easily see
> > > > that one of the processes decides to unmap the page (and it may well be
> > > > that was the initial process that acquired page references) while others
> > > > still keep accessing the page using page references stored in some internal
> > > > structure (RDMA anyone?).
> > > 
> > > Yup, and this is why RDMA on FSDAX using this method of pinning pages
> > > will end up corrupting data and filesystems, hence FOLL_LONGTERM
> > > protecting against most of these situations from even arising. But
> > > that's that workaround, not a long term solution that allows RDMA to
> > > be run on FSDAX managed storage media.
> > > 
> > > I said on #xfs a few days ago:
> > > 
> > > [23/9/22 10:23] * dchinner is getting deja vu over this latest round
> > > of "dax mappings don't pin the filesystem objects that own the
> > > storage media being mapped"
> > > 
> > > And I'm getting that feeling again right now...
> > > 
> > > > I think it will be rather difficult to come up
> > > > with some scheme keeping VMA alive while there are pages pinned without
> > > > regressing userspace which over the years became very much tailored to the
> > > > peculiar gup behavior.
> > > 
> > > Perhaps all we should do is add a page flag for fsdax mapped pages
> > > that says GUP must pin the VMA, so only mapped pages that fall into
> > > this category take the perf penalty of VMA management.
> > 
> > Possibly. But my concern with VMA pinning was not only about performance
> > but also about applications relying on being able to unmap pages that are
> > currently pinned. At least from some processes one of which may be the one
> > doing the original pinning. But yeah, the fact that FOLL_LONGTERM is
> > forbidden with DAX somewhat restricts the insanity we have to deal with. So
> > maybe pinning the VMA for DAX mappings might actually be a workable
> > solution.
> 
> As far as I can see, VMAs are not currently reference counted they are
> just added / deleted from an mm_struct, and nothing guarantees
> mapping_mapped() stays true while a page is pinned.

I agree this solution requires quite some work. But I wanted to say that
in principle it would be a logically consistent and technically not that
difficult solution.
 
> I like Dave's mental model that the inode is the arbiter for the page,
> and the arbiter is not allowed to go out of scope before asserting that
> everything it granted previously has been returned.
> 
> write_inode_now() unconditionally invokes dax_writeback_mapping_range()
> when the inode is committed to going out of scope. write_inode_now() is
> allowed to sleep until all dirty mapping entries are written back. I see
> nothing wrong with additionally checking for entries with elevated page
> reference counts and doing a:
> 
>     __wait_var_event(page, dax_page_idle(page));
> 
> Since the inode is out of scope there should be no concerns with racing
> new 0 -> 1 page->_refcount transitions. Just wait for transient page
> pins to finally drain to zero which should already be on the order of
> the wait time to complete synchrounous writeback in the dirty inode
> case.

I agree this is doable but there's the nasty sideeffect that inode reclaim
may block for abitrary time waiting for page pinning. If the application
that has pinned the page requires __GFP_FS memory allocation to get to a
point where it releases the page, we even have a deadlock possibility.
So it's better than the UAF issue but still not ideal.

								Honza
Dan Williams Sept. 30, 2022, 5:56 p.m. UTC | #14
Jan Kara wrote:
[..]
> I agree this is doable but there's the nasty sideeffect that inode reclaim
> may block for abitrary time waiting for page pinning. If the application
> that has pinned the page requires __GFP_FS memory allocation to get to a
> point where it releases the page, we even have a deadlock possibility.
> So it's better than the UAF issue but still not ideal.

I expect VMA pinning would have similar deadlock exposure if pinning a
VMA keeps the inode allocated. Anything that puts a page-pin release
dependency in the inode freeing path can potentially deadlock a reclaim
event that depends on that inode being freed.

As you say the UAF is worse. I am not too worried about the deadlock
case for a couple reasons:

1/ There are no reports I can find of iput_final() triggering the WARN
that validates that truncate_inode_pages_final() is called while all
associated pages are unpinned. That WARN has been in place since 2017:

d2c997c0f145 fs, dax: use page->mapping to warn if truncate collides with a busy page

2/ It is bad form for I/O drivers to perform __GFP_FS and __GFP_IO
allocations in their fast paths. So while the deadlock is not impossible
it is unlikely with the major producers of transient page pin events.

My hope, famous last words, is that this is only a theoretical deadlock,
or we can handle this with targeted driver fixes. Any driver that thinks
it wants to pin pages and then do more allocations that recurse into the
FS likely wants to get that out of its fast path anyway. I will also
take a look at a lockdep annotation for the wait event to see if that
can give an early warning versus fs_reclaim_acquire().
Jason Gunthorpe Sept. 30, 2022, 6:06 p.m. UTC | #15
On Fri, Sep 30, 2022 at 10:56:27AM -0700, Dan Williams wrote:
> Jan Kara wrote:
> [..]
> > I agree this is doable but there's the nasty sideeffect that inode reclaim
> > may block for abitrary time waiting for page pinning. If the application
> > that has pinned the page requires __GFP_FS memory allocation to get to a
> > point where it releases the page, we even have a deadlock possibility.
> > So it's better than the UAF issue but still not ideal.
> 
> I expect VMA pinning would have similar deadlock exposure if pinning a
> VMA keeps the inode allocated. Anything that puts a page-pin release
> dependency in the inode freeing path can potentially deadlock a reclaim
> event that depends on that inode being freed.

I think the desire would be to go from the VMA to an inode_get and
hold the inode reference for the from the pin_user_pages() to the
unpin_user_page(), ie prevent it from being freed in the first place.

It is a fine idea, the trouble is just the high complexity to get
there.

However, I wonder if the trucate/hole punch paths have the same
deadlock problem?

I agree with you though, given the limited options we should convert
the UAF into an unlikely deadlock.

Jason
Dan Williams Sept. 30, 2022, 6:46 p.m. UTC | #16
Jason Gunthorpe wrote:
> On Fri, Sep 30, 2022 at 10:56:27AM -0700, Dan Williams wrote:
> > Jan Kara wrote:
> > [..]
> > > I agree this is doable but there's the nasty sideeffect that inode reclaim
> > > may block for abitrary time waiting for page pinning. If the application
> > > that has pinned the page requires __GFP_FS memory allocation to get to a
> > > point where it releases the page, we even have a deadlock possibility.
> > > So it's better than the UAF issue but still not ideal.
> > 
> > I expect VMA pinning would have similar deadlock exposure if pinning a
> > VMA keeps the inode allocated. Anything that puts a page-pin release
> > dependency in the inode freeing path can potentially deadlock a reclaim
> > event that depends on that inode being freed.
> 
> I think the desire would be to go from the VMA to an inode_get and
> hold the inode reference for the from the pin_user_pages() to the
> unpin_user_page(), ie prevent it from being freed in the first place.
> 
> It is a fine idea, the trouble is just the high complexity to get
> there.
> 
> However, I wonder if the trucate/hole punch paths have the same
> deadlock problem?

If the deadlock is waiting for inode reclaim to complete then I can see
why the VMA pin proposal and the current truncate paths do not trigger
that deadlock because the inode is kept out of the reclaim path.

> I agree with you though, given the limited options we should convert
> the UAF into an unlikely deadlock.

I think this approach makes the implementation incrementally better, and
that the need to plumb VMA pinning can await evidence that a driver
actually does this *and* the driver can not be fixed.
Jan Kara Oct. 3, 2022, 7:55 a.m. UTC | #17
On Fri 30-09-22 15:06:47, Jason Gunthorpe wrote:
> On Fri, Sep 30, 2022 at 10:56:27AM -0700, Dan Williams wrote:
> > Jan Kara wrote:
> > [..]
> > > I agree this is doable but there's the nasty sideeffect that inode reclaim
> > > may block for abitrary time waiting for page pinning. If the application
> > > that has pinned the page requires __GFP_FS memory allocation to get to a
> > > point where it releases the page, we even have a deadlock possibility.
> > > So it's better than the UAF issue but still not ideal.
> > 
> > I expect VMA pinning would have similar deadlock exposure if pinning a
> > VMA keeps the inode allocated. Anything that puts a page-pin release
> > dependency in the inode freeing path can potentially deadlock a reclaim
> > event that depends on that inode being freed.
> 
> I think the desire would be to go from the VMA to an inode_get and
> hold the inode reference for the from the pin_user_pages() to the
> unpin_user_page(), ie prevent it from being freed in the first place.

Yes, that was the idea how to avoid UAF problems.

> It is a fine idea, the trouble is just the high complexity to get
> there.
> 
> However, I wonder if the trucate/hole punch paths have the same
> deadlock problem?

Do you mean someone requiring say truncate(2) to complete on file F in
order to unpin pages of F? That is certainly a deadlock but it has always
worked this way for DAX so at least applications knowingly targetted at DAX
will quickly notice and avoid such unwise dependency ;).

								Honza
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 556e28d06788..d3ff692d5546 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -816,7 +816,8 @@  xfs_wait_dax_page(
 int
 xfs_break_dax_layouts(
 	struct inode		*inode,
-	bool			*retry)
+	bool			*retry,
+	int			state)
 {
 	struct page		*page;
 
@@ -827,8 +828,8 @@  xfs_break_dax_layouts(
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
-				 0, 0, xfs_wait_dax_page(inode));
+	return ___wait_var_event(page, dax_page_idle(page), state, 0, 0,
+				 xfs_wait_dax_page(inode));
 }
 
 int
@@ -839,14 +840,18 @@  xfs_break_layouts(
 {
 	bool			retry;
 	int			error;
+	int			state = TASK_INTERRUPTIBLE;
 
 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
 
 	do {
 		retry = false;
 		switch (reason) {
+		case BREAK_UNMAP_FINAL:
+			state = TASK_UNINTERRUPTIBLE;
+			fallthrough;
 		case BREAK_UNMAP:
-			error = xfs_break_dax_layouts(inode, &retry);
+			error = xfs_break_dax_layouts(inode, &retry, state);
 			if (error || retry)
 				break;
 			fallthrough;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 28493c8e9bb2..72ce1cb72736 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3452,6 +3452,7 @@  xfs_mmaplock_two_inodes_and_break_dax_layout(
 	struct xfs_inode	*ip1,
 	struct xfs_inode	*ip2)
 {
+	int			state = TASK_INTERRUPTIBLE;
 	int			error;
 	bool			retry;
 	struct page		*page;
@@ -3463,7 +3464,7 @@  xfs_mmaplock_two_inodes_and_break_dax_layout(
 	retry = false;
 	/* Lock the first inode */
 	xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
-	error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
+	error = xfs_break_dax_layouts(VFS_I(ip1), &retry, state);
 	if (error || retry) {
 		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
 		if (error == 0 && retry)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fa780f08dc89..e4994eb6e521 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -454,11 +454,13 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
  * layout-holder has a consistent view of the file's extent map. While
  * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
  * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
- * go idle.
+ * go idle. BREAK_UNMAP_FINAL is an uninterruptible version of
+ * BREAK_UNMAP.
  */
 enum layout_break_reason {
         BREAK_WRITE,
         BREAK_UNMAP,
+        BREAK_UNMAP_FINAL,
 };
 
 /*
@@ -531,7 +533,7 @@  xfs_itruncate_extents(
 }
 
 /* from xfs_file.c */
-int	xfs_break_dax_layouts(struct inode *inode, bool *retry);
+int	xfs_break_dax_layouts(struct inode *inode, bool *retry, int state);
 int	xfs_break_layouts(struct inode *inode, uint *iolock,
 		enum layout_break_reason reason);
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9ac59814bbb6..ebb4a6eba3fc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -725,6 +725,27 @@  xfs_fs_drop_inode(
 	return generic_drop_inode(inode);
 }
 
+STATIC void
+xfs_fs_evict_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	long			error;
+
+	xfs_ilock(ip, iolock);
+
+	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP_FINAL);
+
+	/* The final layout break is uninterruptible */
+	ASSERT_ALWAYS(!error);
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	xfs_iunlock(ip, iolock);
+}
+
 static void
 xfs_mount_free(
 	struct xfs_mount	*mp)
@@ -1144,6 +1165,7 @@  static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.drop_inode		= xfs_fs_drop_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,