@@ -803,13 +803,37 @@ static int __dax_invalidate_entry(struct address_space *mapping,
return ret;
}
+/*
+ * Zap the mappings of @index in @mapping and wait indefinitely for every page
+ * pin to drop; returning early would risk a use-after-free of a busy page.
+ */
+static void dax_break_layout(struct address_space *mapping, pgoff_t index)
+{
+ /* Waiting without locks is only valid once the inode is unreferenced */
+ WARN_ON(atomic_read(&mapping->host->i_count));
+ do {
+ struct page *page;
+
+ page = dax_zap_mappings_range(mapping, index << PAGE_SHIFT,
+ (index + 1) << PAGE_SHIFT);
+ if (!page)
+ return; /* no page returned, nothing left to wait for */
+ wait_var_event(page, dax_page_idle(page));
+ } while (true);
+}
+
/*
* Delete DAX entry at @index from @mapping. Wait for it
* to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
- int ret = __dax_invalidate_entry(mapping, index, true);
+ int ret;
+
+ if (mapping_exiting(mapping))
+ dax_break_layout(mapping, index);
+
+ ret = __dax_invalidate_entry(mapping, index, true);
/*
* This gets called from truncate / punch_hole path. As such, the caller
The fsdax truncate vs page pinning solution is incomplete. The initial solution landed in v4.17 and covered typical truncate invoked through truncate(2) and fallocate(2), i.e. the truncate_inode_pages() called on open files. However, that enabling left truncate_inode_pages_final(), called after iput_final() to free the inode, unprotected. Thankfully that v4.17 enabling also left a warning in place to fire if any truncate is attempted while a DAX page is still pinned: commit d2c997c0f145 ("fs, dax: use page->mapping to warn if truncate collides with a busy page") While a lore search indicates no reports of that firing, the hole is there nonetheless. The concern is that if/when that warning fires it indicates a use-after-free condition whereby the filesystem has lost the ability to arbitrate access to its storage blocks. For example, in the worst case, DMA may be ongoing while the filesystem thinks the block is free to be reallocated to another inode. This patch is based on an observation from Dave that during iput_final() there is no need to hold filesystem locks like the explicit truncate path. The wait can occur from within dax_delete_mapping_entry() called by truncate_folio_batch_exceptionals(). This solution trades off fixing the use-after-free with a theoretical deadlock scenario. If the agent holding the page pin triggers inode reclaim and that reclaim waits for the pin to drop it will deadlock. Two observations make this approach still worth pursuing: 1/ Any existing scenarios where that happens would have triggered the warning referenced above which has shipped upstream for ~5 years without a bug report on lore. 2/ Most I/O drivers only hold page pins in their fast paths and new __GFP_FS allocations are unlikely in a driver fast path. I.e. if the deadlock triggers the likely fix would be in the offending driver, not new band-aids in fsdax. 
So, update the DAX core to notice that the inode->i_mapping is in the exiting state, use that as a signal that the inode is unreferenced, and await page-pins to drain. Cc: Matthew Wilcox <willy@infradead.org> Cc: Jan Kara <jack@suse.cz> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Christoph Hellwig <hch@lst.de> Cc: John Hubbard <jhubbard@nvidia.com> Reported-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- fs/dax.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-)