@@ -395,7 +395,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+ WARN_ON_ONCE(trunc && page_maybe_dma_pinned(page));
if (dax_mapping_is_cow(page->mapping)) {
/* keep the CoW flag if this page is still shared */
if (page->index-- > 0)
@@ -414,7 +414,7 @@ static struct page *dax_pinned_page(void *entry)
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- if (page_ref_count(page) > 1)
+ if (page_maybe_dma_pinned(page))
return page;
}
return NULL;
@@ -3961,10 +3961,9 @@ int ext4_break_layouts(struct inode *inode)
if (!page)
return 0;
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(inode));
+ error = ___wait_var_event(page, !page_maybe_dma_pinned(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ ext4_wait_dax_page(inode));
} while (error == 0);
return error;
@@ -676,9 +676,9 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
return 0;
*retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, fuse_wait_dax_page(inode));
+ return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ fuse_wait_dax_page(inode));
}
/* dmap_end == 0 leads to unmapping of whole file */
@@ -827,9 +827,9 @@ xfs_break_dax_layouts(
return 0;
*retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
+ return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ xfs_wait_dax_page(inode));
}
int
@@ -1517,6 +1517,34 @@ static inline bool page_maybe_dma_pinned(struct page *page)
return folio_maybe_dma_pinned(page_folio(page));
}
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
+/*
+ * Unlike typical file backed pages that support truncating a page from
+ * a file while it is under active DMA, DAX pages need to hold off
+ * truncate operations until transient page pins are released.
+ *
+ * The filesystem (via dax_layout_pinned_page()) takes steps to make
+ * sure that any observation of the !page_maybe_dma_pinned() state is
+ * stable until the truncation completes.
+ */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+ struct page *page = &folio->page;
+
+ if (!folio_is_zone_device(folio))
+ return;
+ if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ return;
+ if (folio_maybe_dma_pinned(folio))
+ return;
+ wake_up_var(page);
+}
+#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+}
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for an anon page on the src mm.
@@ -177,8 +177,10 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_page_refs(&folio->page, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
+
+ if (flags & FOLL_PIN)
+ wakeup_fsdax_pin_waiters(folio);
}
/**
The pin_user_pages() + page_maybe_dma_pinned() infrastructure is a framework for tackling the kernel's struggles with gup+DMA. DAX presents a unique flavor of the gup+DMA problem since pinned pages are identical to physical filesystem blocks. Unlike the page-cache case, a mapping of a file cannot be truncated while DMA is in-flight because the DMA must complete before the filesystem block is reclaimed. DAX has a homegrown solution to this problem based on watching the page->_refcount go idle. Beyond being awkward to catch that idle transition in put_page(), it is overkill when only the page_maybe_dma_pinned() transition needs to be captured. Move the wakeup of filesystem-DAX truncate paths ({ext4,xfs,fuse_dax}_break_layouts()) to unpin_user_pages() with a new wakeup_fsdax_pin_waiters() helper, and use !page_maybe_dma_pinned() as the wake condition. Cc: Jan Kara <jack@suse.cz> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: John Hubbard <jhubbard@nvidia.com> Reported-by: Jason Gunthorpe <jgg@nvidia.com> Reported-by: Matthew Wilcox <willy@infradead.org> Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- fs/dax.c | 4 ++-- fs/ext4/inode.c | 7 +++---- fs/fuse/dax.c | 6 +++--- fs/xfs/xfs_file.c | 6 +++--- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ mm/gup.c | 6 ++++-- 6 files changed, 43 insertions(+), 14 deletions(-)