
[RFC,v1] mm/page cache: A method to drop one-read page cache

Message ID 20250302061427.33455-1-a929244872@163.com

Commit Message

wang wei March 2, 2025, 6:14 a.m. UTC
Jens Axboe added a new flag, RWF_DONTCACHE, to the preadv2()
and pwritev2() system calls [1]. The flag is particularly
suitable for pages that are read only once: such pages still
benefit from the fast data access provided by the page cache,
yet can be quickly dropped from memory without going through
kswapd. However, in most cases programmers cannot tell in
advance which pages will be read only once. This made me think
of the refault distance feature. If a page is accessed again
shortly after being evicted, the refault distance heuristic
moves it to the active LRU list. Conversely, if a page is
added to the page cache for the first time, or has not been
accessed again for an extended period after being evicted, it
is placed on the inactive LRU list. IMO, pages that remain
unaccessed for a long time after being evicted can be treated
as "read once" pages. By setting the DONTCACHE flag on such
pages, the system can quickly remove them from memory.
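
For reference, a minimal userspace sketch of how the flag is
meant to be used (the fallback #define value is taken from the
series in [1]; error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE	0x00000080	/* value from the series in [1] */
#endif

int main(void)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0)
		return 1;
	/* Read through the page cache, but let the kernel drop the
	 * cached pages once the read completes, bypassing kswapd. */
	if (preadv2(fd, &iov, 1, 0, RWF_DONTCACHE) < 0)
		perror("preadv2");
	close(fd);
	return 0;
}

On kernels without the feature, preadv2() should fail with
EOPNOTSUPP, so callers can fall back to a plain read.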

After this patch, the refault distance feature categorizes
file pages into three types (a sketch of the combined decision
logic follows the list).

1. Pages accessed again shortly after being evicted:
These pages are moved to the active LRU list so that they
remain efficiently accessible in memory.

2. Pages added to the page cache for the first time:
These pages are placed directly on the inactive LRU list,
where subsequent accesses determine their activity.

3. Pages that remain unaccessed for a long time after being evicted:
These pages are marked with the DONTCACHE flag, allowing the
system to quickly remove them from memory without involving
kswapd in costly reclaim work.
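
To make the three cases easier to see than in the scattered
hunks below, here is an illustrative sketch of the combined
decision logic; classify_refault() is a hypothetical name used
only for illustration, while workingset_test_recent(),
folio_set_active() and __folio_set_dropbehind() are the
existing helpers the patch builds on:

/*
 * Illustrative sketch only, not the patch itself. "shadow" is
 * the shadow entry left behind by a previous eviction, or NULL
 * for a folio entering the page cache for the first time.
 */
static void classify_refault(struct folio *folio, void *shadow,
			     bool file)
{
	bool workingset = false;

	if (!shadow)
		/* Case 2: first time in the cache; the folio stays
		 * on the inactive LRU list. */
		return;

	if (workingset_test_recent(shadow, file, &workingset, true)) {
		/* Case 1: refault within the workingset window. */
		folio_set_active(folio);
	} else if (file && folio->mapping) {
		/* Case 3: refault long after eviction; mark it
		 * read-once so it can be dropped without kswapd. */
		__folio_set_dropbehind(folio);
	}
}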

This patch may introduce a new issue, however. If the system
runs long enough, every file page will have been accessed at
least once, so no page qualifies as a first access anymore.
Every page added to the page cache is then either moved to the
active LRU list or marked with the DONTCACHE flag, which may
degrade system performance. The thresholds for telling the
three types of pages apart would need to be retuned. I would
therefore like to ask the community what new problems this
modification might introduce and how they could be addressed.

[1]: https://lore.kernel.org/all/20241220154831.1086649-1-axboe@kernel.dk/

Signed-off-by: wang wei <a929244872@163.com>
---
 mm/filemap.c    | 18 ++++++++++++------
 mm/internal.h   |  2 +-
 mm/truncate.c   |  6 +++---
 mm/workingset.c |  5 ++++-
 4 files changed, 20 insertions(+), 11 deletions(-)

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index 804d73656..ee7afff9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1606,7 +1606,7 @@  static void folio_end_dropbehind_write(struct folio *folio)
 	 */
 	if (in_task() && folio_trylock(folio)) {
 		if (folio->mapping)
-			folio_unmap_invalidate(folio->mapping, folio, 0);
+			folio_unmap_invalidate(folio->mapping, folio, 0, NULL);
 		folio_unlock(folio);
 	}
 }
@@ -2625,15 +2625,21 @@  static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
 }
 
 static void filemap_end_dropbehind_read(struct address_space *mapping,
-					struct folio *folio)
+					struct folio *folio, int ki_flags)
 {
+	void *shadow = NULL;
+
 	if (!folio_test_dropbehind(folio))
 		return;
 	if (folio_test_writeback(folio) || folio_test_dirty(folio))
 		return;
 	if (folio_trylock(folio)) {
-		if (folio_test_clear_dropbehind(folio))
-			folio_unmap_invalidate(mapping, folio, 0);
+		if (folio_test_clear_dropbehind(folio)) {
+			/* If this folio is dropped by preadv2(), do not record the eviction */
+			if (!(ki_flags & IOCB_DONTCACHE))
+				shadow = workingset_eviction(folio, folio_memcg(folio));
+			folio_unmap_invalidate(mapping, folio, 0, shadow);
+		}
 		folio_unlock(folio);
 	}
 }
@@ -2754,7 +2760,7 @@  ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			filemap_end_dropbehind_read(mapping, folio);
+			filemap_end_dropbehind_read(mapping, folio, iocb->ki_flags);
 			folio_put(folio);
 		}
 		folio_batch_init(&fbatch);
@@ -3455,7 +3461,7 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 			mapping_locked = true;
 		}
 		folio = __filemap_get_folio(mapping, index,
-					  FGP_CREAT|FGP_FOR_MMAP,
+					  FGP_CREAT|FGP_FOR_MMAP|FGP_DONTCACHE,
 					  vmf->gfp_mask);
 		if (IS_ERR(folio)) {
 			if (fpin)
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fe..5f9a5b6c4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -393,7 +393,7 @@  void unmap_page_range(struct mmu_gather *tlb,
 			     unsigned long addr, unsigned long end,
 			     struct zap_details *details);
 int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
-			   gfp_t gfp);
+			   gfp_t gfp, void *shadow);
 
 void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
 		unsigned int order);
diff --git a/mm/truncate.c b/mm/truncate.c
index e2e115adf..204006a9d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -542,7 +542,7 @@  static int folio_launder(struct address_space *mapping, struct folio *folio)
  * sitting in the folio_add_lru() caches.
  */
 int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
-			   gfp_t gfp)
+			   gfp_t gfp, void *shadow)
 {
 	int ret;
 
@@ -568,7 +568,7 @@  int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
 		goto failed;
 
 	BUG_ON(folio_has_private(folio));
-	__filemap_remove_folio(folio, NULL);
+	__filemap_remove_folio(folio, shadow);
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
 		inode_add_lru(mapping->host);
@@ -643,7 +643,7 @@  int invalidate_inode_pages2_range(struct address_space *mapping,
 			}
 			VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
 			folio_wait_writeback(folio);
-			ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
+			ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL, NULL);
 			if (ret2 < 0)
 				ret = ret2;
 			folio_unlock(folio);
diff --git a/mm/workingset.c b/mm/workingset.c
index 4841ae8af..e606ce0c5 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -563,7 +563,10 @@  void workingset_refault(struct folio *folio, void *shadow)
 
 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
-	if (!workingset_test_recent(shadow, file, &workingset, true))
+	if (!workingset_test_recent(shadow, file, &workingset, true)) {
+		if (file && folio->mapping)
+			__folio_set_dropbehind(folio);
 		return;
+	}
 
 	folio_set_active(folio);