
Creating large folios in iomap buffered write path

Message ID: ZGZwNqYhttjREl0V@casper.infradead.org (mailing list archive)
State: Deferred, archived
Series: Creating large folios in iomap buffered write path

Commit Message

Matthew Wilcox (Oracle) May 18, 2023, 6:36 p.m. UTC
On Wed, May 17, 2023 at 09:07:41PM +0800, Wang Yugui wrote:
> Dave Chinner wrote:
> > I suspect we need to start using high order folios in the write path
> > where we have large user IOs for streaming writes, but I also wonder
> > if there isn't some sort of batched accounting/mapping tree updates
> > we could do for all the adjacent folios in a single bio....
> 
> Is there some comment from Matthew Wilcox?
> since it seems a folio problem?

Not so much a "folio problem" as "an enhancement nobody got around to doing
yet".  Here's a first attempt.  It's still churning through an xfstests
run for me.  I have seen this warning trigger:

                WARN_ON_ONCE(!folio_test_uptodate(folio) &&
                             folio_test_dirty(folio));

in iomap_invalidate_folio() as it's now possible to create a folio
for write that is larger than the write, and therefore we won't
mark it uptodate.  Maybe we should create slightly smaller folios.
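
One way to get "slightly smaller" folios is to round the order down
from the write length, as the revised fgp_order() later in this thread
does with ilog2().  A minimal userspace sketch of that policy, assuming
PAGE_SHIFT is 12 and approximating the kernel helpers:

#include <stdio.h>

#define PAGE_SHIFT	12

/* Round down: the chosen folio never extends past the write. */
static unsigned int order_for_len(size_t len)
{
	unsigned int shift = 0;

	while ((len >> shift) > 1)	/* shift = ilog2(len) */
		shift++;
	if (shift <= PAGE_SHIFT)
		return 0;		/* order 0: a single page */
	return shift - PAGE_SHIFT;
}

int main(void)
{
	/* A 1MB write maps to order 8 (256 pages of 4kB); a 12kB write
	 * rounds down to order 1 rather than up to order 2. */
	printf("%u %u\n", order_for_len(1024 * 1024),
	       order_for_len(12 * 1024));
	return 0;
}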

Anyway, how does this help your performance problem?

Comments

Matthew Wilcox (Oracle) May 18, 2023, 9:46 p.m. UTC | #1
On Thu, May 18, 2023 at 07:36:38PM +0100, Matthew Wilcox wrote:
> Not so much a "folio problem" as "an enhancement nobody got around to doing
> yet".  Here's a first attempt.  It's still churning through an xfstests
> run for me.  I have seen this warning trigger:
> 
>                 WARN_ON_ONCE(!folio_test_uptodate(folio) &&
>                              folio_test_dirty(folio));
> 
> in iomap_invalidate_folio() as it's now possible to create a folio
> for write that is larger than the write, and therefore we won't
> mark it uptodate.  Maybe we should create slightly smaller folios.

Here's one that does.  A couple of other small problems also fixed.


diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c739b258a2d9..3702e5e47b0f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 	if (status)
 		return ERR_PTR(status);
 
-	folio = iomap_get_folio(iter, pos);
+	folio = iomap_get_folio(iter, pos, len);
 	if (IS_ERR(folio))
 		gfs2_trans_end(sdp);
 	return folio;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 063133ec77f4..32ddddf9f35c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -461,19 +461,25 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  * iomap_get_folio - get a folio reference for writing
  * @iter: iteration structure
  * @pos: start offset of write
+ * @len: length of write
  *
  * Returns a locked reference to the folio at @pos, or an error pointer if the
  * folio could not be obtained.
  */
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
+	struct folio *folio;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	fgp |= fgp_order(len);
 
-	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
+	if (!IS_ERR(folio) && folio_test_large(folio))
+		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
+	return folio;
 }
 EXPORT_SYMBOL_GPL(iomap_get_folio);
 
@@ -510,8 +516,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 		iomap_page_release(folio);
 	} else if (folio_test_large(folio)) {
 		/* Must release the iop so the page can be split */
-		WARN_ON_ONCE(!folio_test_uptodate(folio) &&
-			     folio_test_dirty(folio));
+		VM_WARN_ON_ONCE_FOLIO(!folio_test_uptodate(folio) &&
+				folio_test_dirty(folio), folio);
 		iomap_page_release(folio);
 	}
 }
@@ -603,7 +609,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
 	if (folio_ops && folio_ops->get_folio)
 		return folio_ops->get_folio(iter, pos, len);
 	else
-		return iomap_get_folio(iter, pos);
+		return iomap_get_folio(iter, pos, len);
 }
 
 static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..80facb9c9e5b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,7 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..f4d05beb64eb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -466,6 +466,19 @@ static inline void *detach_page_private(struct page *page)
 	return folio_detach_private(page_folio(page));
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
 #ifdef CONFIG_NUMA
 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
@@ -505,14 +518,24 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_STABLE		0x00000080
+#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
 
 #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
+static inline unsigned fgp_order(size_t size)
+{
+	unsigned int shift = ilog2(size);
+
+	if (shift <= PAGE_SHIFT)
+		return 0;
+	return (shift - PAGE_SHIFT) << 26;
+}
+
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -586,7 +609,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-					pgoff_t offset, int fgp_flags)
+					pgoff_t offset, unsigned fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..7abbb072d4d9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,7 +1910,7 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
@@ -1952,7 +1952,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		folio_wait_stable(folio);
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
+		unsigned order = FGP_ORDER(fgp_flags);
 		int err;
+
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
@@ -1961,26 +1963,38 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			gfp &= ~GFP_KERNEL;
 			gfp |= GFP_NOWAIT | __GFP_NOWARN;
 		}
-
-		folio = filemap_alloc_folio(gfp, 0);
-		if (!folio)
-			return ERR_PTR(-ENOMEM);
-
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		/* Init accessed so avoid atomic mark_page_accessed later */
-		if (fgp_flags & FGP_ACCESSED)
-			__folio_set_referenced(folio);
+		if (order > MAX_PAGECACHE_ORDER)
+			order = MAX_PAGECACHE_ORDER;
+		/* If we're not aligned, allocate a smaller folio */
+		if (index & ((1UL << order) - 1))
+			order = __ffs(index);
 
-		err = filemap_add_folio(mapping, folio, index, gfp);
-		if (unlikely(err)) {
+		do {
+			err = -ENOMEM;
+			if (order == 1)
+				order = 0;
+			folio = filemap_alloc_folio(gfp, order);
+			if (!folio)
+				continue;
+
+			/* Init accessed so avoid atomic mark_page_accessed later */
+			if (fgp_flags & FGP_ACCESSED)
+				__folio_set_referenced(folio);
+
+			err = filemap_add_folio(mapping, folio, index, gfp);
+			if (!err)
+				break;
 			folio_put(folio);
 			folio = NULL;
-			if (err == -EEXIST)
-				goto repeat;
-		}
+		} while (order-- > 0);
 
+		if (err == -EEXIST)
+			goto repeat;
+		if (err)
+			return ERR_PTR(err);
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..c96e88d9a262 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
 
 noinline
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..59a071badb90 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -462,19 +462,6 @@ static int try_context_readahead(struct address_space *mapping,
 	return 1;
 }
 
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size.  I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER	8
-#endif
-
 static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 		pgoff_t mark, unsigned int order, gfp_t gfp)
 {
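
The FGP_ORDER() encoding above packs the folio order into the
previously unused top six bits of the fgp flags word, leaving the
existing flag bits alone.  A standalone round-trip sketch, with
userspace stand-ins for the kernel helpers (FGP_STABLE's value is
taken from the patch; PAGE_SHIFT of 12 is assumed):

#include <stdio.h>

#define PAGE_SHIFT	12
#define FGP_STABLE	0x00000080		/* from the patch above */
#define FGP_ORDER(fgp)	((fgp) >> 26)		/* top 6 bits */

/* Mirrors the ilog2()-based helper in the patch above. */
static unsigned int fgp_order(size_t size)
{
	unsigned int shift = 0;

	while ((size >> shift) > 1)		/* shift = ilog2(size) */
		shift++;
	if (shift <= PAGE_SHIFT)
		return 0;
	return (shift - PAGE_SHIFT) << 26;
}

int main(void)
{
	unsigned int fgp = FGP_STABLE | fgp_order(1024 * 1024);

	/* Prints flags:0x80 order:8 - the low bits still carry the
	 * flags and the top six bits decode back to the order. */
	printf("flags:%#x order:%u\n", fgp & ((1u << 26) - 1),
	       FGP_ORDER(fgp));
	return 0;
}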
Matthew Wilcox (Oracle) May 18, 2023, 10:03 p.m. UTC | #2
On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
>  {
>  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> +	struct folio *folio;
>  
>  	if (iter->flags & IOMAP_NOWAIT)
>  		fgp |= FGP_NOWAIT;
> +	fgp |= fgp_order(len);
>  
> -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
>  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> +	if (!IS_ERR(folio) && folio_test_large(folio))
> +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> +	return folio;
>  }

Forgot to take the debugging out.  This should read:

-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	fgp |= fgp_order(len);
 
 	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
 }
Wang Yugui May 19, 2023, 2:55 a.m. UTC | #3
Hi,

> On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> >  {
> >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > +	struct folio *folio;
> >  
> >  	if (iter->flags & IOMAP_NOWAIT)
> >  		fgp |= FGP_NOWAIT;
> > +	fgp |= fgp_order(len);
> >  
> > -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > +	if (!IS_ERR(folio) && folio_test_large(folio))
> > +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> > +	return folio;
> >  }
> 
> Forgot to take the debugging out.  This should read:
> 
> -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
>  {
>  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
>  	if (iter->flags & IOMAP_NOWAIT)
>  		fgp |= FGP_NOWAIT;
> +	fgp |= fgp_order(len);
>  
>  	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
>  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
>  }

I tested it (attached file) on 6.4.0-rc2.
fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 -numjobs=4 -directory=/mnt/test

fio  WRITE: bw=2430MiB/s.
expected value: > 6000MiB/s
so there is no fio write bandwidth improvement yet.

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2023/05/19
Matthew Wilcox (Oracle) May 19, 2023, 3:38 p.m. UTC | #4
On Fri, May 19, 2023 at 10:55:29AM +0800, Wang Yugui wrote:
> Hi,
> 
> > On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> > > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > >  {
> > >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > > +	struct folio *folio;
> > >  
> > >  	if (iter->flags & IOMAP_NOWAIT)
> > >  		fgp |= FGP_NOWAIT;
> > > +	fgp |= fgp_order(len);
> > >  
> > > -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > > +	if (!IS_ERR(folio) && folio_test_large(folio))
> > > +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> > > +	return folio;
> > >  }
> > 
> > Forgot to take the debugging out.  This should read:
> > 
> > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> >  {
> >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> >  	if (iter->flags & IOMAP_NOWAIT)
> >  		fgp |= FGP_NOWAIT;
> > +	fgp |= fgp_order(len);
> >  
> >  	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> >  }
> 
> I tested it (attached file) on 6.4.0-rc2.
> fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 -numjobs=4 -directory=/mnt/test
> 
> fio  WRITE: bw=2430MiB/s.
> expected value: > 6000MiB/s
> so there is no fio write bandwidth improvement yet.

That's basically unchanged.  There's no per-page or per-block work being
done in start/end writeback, so if Dave's investigation is applicable
to your situation, I'd expect to see an improvement.

Maybe try the second version of the patch I sent with the debug in,
to confirm you really are seeing large folios being created (you might
want to use printk_ratelimit() instead of printk to ensure it doesn't
overwhelm your system)?  That fio command you were using ought to create
them, but there's always a chance it doesn't.

Perhaps you could use perf (the command Dave used) to see where the time
is being spent.
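
For reference, a rate-limited variant of the debug line would look
something like this (a sketch against the v2 debug patch, not a posted
revision); the "callbacks suppressed" message that appears in the next
reply is the ratelimiter reporting dropped lines:

	if (!IS_ERR(folio) && folio_test_large(folio) && printk_ratelimit())
		printk("index:%lu len:%zu order:%u\n",
		       (unsigned long)(pos / PAGE_SIZE), len,
		       folio_order(folio));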
Wang Yugui May 20, 2023, 1:35 p.m. UTC | #5
Hi,

> On Fri, May 19, 2023 at 10:55:29AM +0800, Wang Yugui wrote:
> > Hi,
> > 
> > > On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> > > > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > > > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > > >  {
> > > >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > > > +	struct folio *folio;
> > > >  
> > > >  	if (iter->flags & IOMAP_NOWAIT)
> > > >  		fgp |= FGP_NOWAIT;
> > > > +	fgp |= fgp_order(len);
> > > >  
> > > > -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > > +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > > > +	if (!IS_ERR(folio) && folio_test_large(folio))
> > > > +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> > > > +	return folio;
> > > >  }
> > > 
> > > Forgot to take the debugging out.  This should read:
> > > 
> > > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > >  {
> > >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > >  	if (iter->flags & IOMAP_NOWAIT)
> > >  		fgp |= FGP_NOWAIT;
> > > +	fgp |= fgp_order(len);
> > >  
> > >  	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > >  }
> > 
> > I tested it (attached file) on 6.4.0-rc2.
> > fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 -numjobs=4 -directory=/mnt/test
> > 
> > fio  WRITE: bw=2430MiB/s.
> > expected value: > 6000MiB/s
> > so there is no fio write bandwidth improvement yet.
> 
> That's basically unchanged.  There's no per-page or per-block work being
> done in start/end writeback, so if Dave's investigation is applicable
> to your situation, I'd expect to see an improvement.
> 
> Maybe try the second version of the patch I sent with the debug in,
> to confirm you really are seeing large folios being created (you might
> want to use printk_ratelimit() instead of printk to ensure it doesn't
> overwhelm your system)?  That fio command you were using ought to create
> them, but there's always a chance it doesn't.
> 
> Perhaps you could use perf (the command Dave used) to see where the time
> is being spent.

Test results of the second version of the patch:

# dmesg |grep 'index\|suppressed'
[   89.376149] index:0 len:292 order:2
[   97.862938] index:0 len:4096 order:2
[   98.340665] index:0 len:4096 order:2
[   98.346633] index:0 len:4096 order:2
[   98.352323] index:0 len:4096 order:2
[   98.359952] index:0 len:4096 order:2
[   98.364015] index:3 len:4096 order:2
[   98.368943] index:0 len:4096 order:2
[   98.374285] index:0 len:4096 order:2
[   98.379163] index:3 len:4096 order:2
[   98.384760] index:0 len:4096 order:2
[  181.103751] iomap_get_folio: 342 callbacks suppressed
[  181.103761] index:0 len:292 order:2


'perf report -g' result:
Samples: 344K of event 'cycles', Event count (approx.): 103747884662
  Children      Self  Command          Shared Object            Symbol
+   58.73%     0.01%  fio              [kernel.kallsyms]        [k] entry_SYSCALL_64_after_hwframe
+   58.72%     0.01%  fio              [kernel.kallsyms]        [k] do_syscall_64
+   58.53%     0.00%  fio              libpthread-2.17.so       [.] 0x00007f83e400e6fd
+   58.47%     0.01%  fio              [kernel.kallsyms]        [k] ksys_write
+   58.45%     0.02%  fio              [kernel.kallsyms]        [k] vfs_write
+   58.41%     0.03%  fio              [kernel.kallsyms]        [k] xfs_file_buffered_write
+   57.96%     0.57%  fio              [kernel.kallsyms]        [k] iomap_file_buffered_write
+   27.57%     1.29%  fio              [kernel.kallsyms]        [k] iomap_write_begin
+   25.32%     0.43%  fio              [kernel.kallsyms]        [k] iomap_get_folio
+   24.84%     0.70%  fio              [kernel.kallsyms]        [k] __filemap_get_folio.part.69
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] ret_from_fork
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] kthread
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] worker_thread
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] process_one_work
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] wb_workfn
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] wb_writeback
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] __writeback_inodes_wb
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] writeback_sb_inodes
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] __writeback_single_inode
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] do_writepages
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] xfs_vm_writepages
+   20.10%     0.00%  kworker/u98:15-  [kernel.kallsyms]        [k] iomap_writepages
+   20.10%     0.69%  kworker/u98:15-  [kernel.kallsyms]        [k] write_cache_pages
+   16.95%     0.39%  fio              [kernel.kallsyms]        [k] copy_page_from_iter_atomic
+   16.53%     0.10%  fio              [kernel.kallsyms]        [k] copyin


'perf report ' result:

Samples: 335K of event 'cycles', Event count (approx.): 108508755052
Overhead  Command          Shared Object        Symbol
  17.70%  fio              [kernel.kallsyms]    [k] rep_movs_alternative
   2.89%  kworker/u98:2-e  [kernel.kallsyms]    [k] native_queued_spin_lock_slowpath
   2.88%  fio              [kernel.kallsyms]    [k] get_page_from_freelist
   2.67%  fio              [kernel.kallsyms]    [k] native_queued_spin_lock_slowpath
   2.59%  fio              [kernel.kallsyms]    [k] asm_exc_nmi
   2.25%  swapper          [kernel.kallsyms]    [k] intel_idle
   1.59%  kworker/u98:2-e  [kernel.kallsyms]    [k] __folio_start_writeback
   1.52%  fio              [kernel.kallsyms]    [k] xas_load
   1.45%  fio              [kernel.kallsyms]    [k] lru_add_fn
   1.44%  fio              [kernel.kallsyms]    [k] xas_descend
   1.32%  fio              [kernel.kallsyms]    [k] iomap_write_begin
   1.29%  fio              [kernel.kallsyms]    [k] __filemap_add_folio
   1.08%  kworker/u98:2-e  [kernel.kallsyms]    [k] folio_clear_dirty_for_io
   1.07%  fio              [kernel.kallsyms]    [k] __folio_mark_dirty
   0.94%  fio              [kernel.kallsyms]    [k] iomap_set_range_uptodate.part.24
   0.93%  fio              [kernel.kallsyms]    [k] node_dirty_ok
   0.92%  kworker/u98:2-e  [kernel.kallsyms]    [k] _raw_spin_lock_irqsave
   0.83%  fio              [kernel.kallsyms]    [k] xas_start
   0.83%  fio              [kernel.kallsyms]    [k] __alloc_pages
   0.83%  fio              [kernel.kallsyms]    [k] _raw_spin_lock_irqsave
   0.81%  kworker/u98:2-e  [kernel.kallsyms]    [k] asm_exc_nmi
   0.79%  fio              [kernel.kallsyms]    [k] percpu_counter_add_batch
   0.75%  kworker/u98:2-e  [kernel.kallsyms]    [k] iomap_writepage_map
   0.74%  kworker/u98:2-e  [kernel.kallsyms]    [k] __mod_lruvec_page_state
   0.70%  fio              fio                  [.] 0x000000000001b1ac
   0.70%  fio              [kernel.kallsyms]    [k] filemap_dirty_folio
   0.69%  kworker/u98:2-e  [kernel.kallsyms]    [k] write_cache_pages
   0.69%  fio              [kernel.kallsyms]    [k] __filemap_get_folio.part.69
   0.67%  kworker/1:0-eve  [kernel.kallsyms]    [k] native_queued_spin_lock_slowpath
   0.66%  fio              [kernel.kallsyms]    [k] __mod_lruvec_page_state
   0.64%  fio              [kernel.kallsyms]    [k] __mod_node_page_state
   0.64%  fio              [kernel.kallsyms]    [k] folio_add_lru
   0.64%  fio              [kernel.kallsyms]    [k] balance_dirty_pages_ratelimited_flags
   0.62%  fio              [kernel.kallsyms]    [k] __mod_memcg_lruvec_state
   0.61%  fio              [kernel.kallsyms]    [k] iomap_write_end
   0.60%  fio              [kernel.kallsyms]    [k] xas_find_conflict
   0.59%  fio              [kernel.kallsyms]    [k] bad_range
   0.58%  kworker/u98:2-e  [kernel.kallsyms]    [k] xas_load
   0.57%  fio              [kernel.kallsyms]    [k] iomap_file_buffered_write
   0.56%  kworker/u98:2-e  [kernel.kallsyms]    [k] percpu_counter_add_batch
   0.49%  fio              [kernel.kallsyms]    [k] __might_resched


'perf top -g -U' result:
Samples: 78K of event 'cycles', 4000 Hz, Event count (approx.): 29400815085 lost: 0/0 drop: 0/0
  Children      Self  Shared Object       Symbol
+   62.59%     0.03%  [kernel]            [k] entry_SYSCALL_64_after_hwframe
+   60.15%     0.02%  [kernel]            [k] do_syscall_64
+   59.45%     0.02%  [kernel]            [k] vfs_write
+   59.09%     0.54%  [kernel]            [k] iomap_file_buffered_write
+   57.41%     0.00%  [kernel]            [k] ksys_write
+   57.36%     0.01%  [kernel]            [k] xfs_file_buffered_write
+   37.82%     0.00%  libpthread-2.17.so  [.] 0x00007fce6f20e6fd
+   26.83%     1.20%  [kernel]            [k] iomap_write_begin
+   24.65%     0.45%  [kernel]            [k] iomap_get_folio
+   24.15%     0.74%  [kernel]            [k] __filemap_get_folio.part.69
+   20.17%     0.00%  [kernel]            [k] __writeback_single_inode
+   20.17%     0.65%  [kernel]            [k] write_cache_pages
+   17.66%     0.43%  [kernel]            [k] copy_page_from_iter_atomic
+   17.18%     0.12%  [kernel]            [k] copyin
+   17.08%    16.71%  [kernel]            [k] rep_movs_alternative
+   16.97%     0.00%  [kernel]            [k] ret_from_fork
+   16.97%     0.00%  [kernel]            [k] kthread
+   16.87%     0.00%  [kernel]            [k] worker_thread
+   16.84%     0.00%  [kernel]            [k] process_one_work
+   14.86%     0.17%  [kernel]            [k] filemap_add_folio
+   13.83%     0.77%  [kernel]            [k] iomap_writepage_map
+   11.90%     0.33%  [kernel]            [k] iomap_finish_ioend
+   11.57%     0.23%  [kernel]            [k] folio_end_writeback
+   11.51%     0.73%  [kernel]            [k] iomap_write_end
+   11.30%     2.14%  [kernel]            [k] __folio_end_writeback
+   10.70%     0.00%  [kernel]            [k] wb_workfn
+   10.70%     0.00%  [kernel]            [k] wb_writeback
+   10.70%     0.00%  [kernel]            [k] __writeback_inodes_wb
+   10.70%     0.00%  [kernel]            [k] writeback_sb_inodes
+   10.70%     0.00%  [kernel]            [k] do_writepages
+   10.70%     0.00%  [kernel]            [k] xfs_vm_writepages
+   10.70%     0.00%  [kernel]            [k] iomap_writepages
+   10.19%     2.68%  [kernel]            [k] _raw_spin_lock_irqsave
+   10.17%     1.35%  [kernel]            [k] __filemap_add_folio
+    9.94%     0.00%  [unknown]           [k] 0x0000000001942a70
+    9.94%     0.00%  [unknown]           [k] 0x0000000001942ac0
+    9.94%     0.00%  [unknown]           [k] 0x0000000001942b30

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2023/05/20
Matthew Wilcox (Oracle) May 20, 2023, 4:35 p.m. UTC | #6
On Sat, May 20, 2023 at 09:35:32PM +0800, Wang Yugui wrote:
> test result of the second version of the patch.
> 
> # dmesg |grep 'index\|suppressed'
> [   89.376149] index:0 len:292 order:2
> [   97.862938] index:0 len:4096 order:2
> [   98.340665] index:0 len:4096 order:2
> [   98.346633] index:0 len:4096 order:2
> [   98.352323] index:0 len:4096 order:2
> [   98.359952] index:0 len:4096 order:2
> [   98.364015] index:3 len:4096 order:2
> [   98.368943] index:0 len:4096 order:2
> [   98.374285] index:0 len:4096 order:2
> [   98.379163] index:3 len:4096 order:2
> [   98.384760] index:0 len:4096 order:2
> [  181.103751] iomap_get_folio: 342 callbacks suppressed
> [  181.103761] index:0 len:292 order:2

Thanks.  Clearly we're not creating large folios in the write path.
I tracked it down using your fio command; my system now creates 1MB
folios, so I think yours will too.  Patch series incoming (I fixed a
couple of other oversights too).

Patch

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c739b258a2d9..3702e5e47b0f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -971,7 +971,7 @@  gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 	if (status)
 		return ERR_PTR(status);
 
-	folio = iomap_get_folio(iter, pos);
+	folio = iomap_get_folio(iter, pos, len);
 	if (IS_ERR(folio))
 		gfs2_trans_end(sdp);
 	return folio;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 063133ec77f4..21f33731617a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -461,16 +461,18 @@  EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  * iomap_get_folio - get a folio reference for writing
  * @iter: iteration structure
  * @pos: start offset of write
+ * @len: length of write
  *
  * Returns a locked reference to the folio at @pos, or an error pointer if the
  * folio could not be obtained.
  */
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	fgp |= fgp_order(len);
 
 	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
@@ -603,7 +605,7 @@  static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
 	if (folio_ops && folio_ops->get_folio)
 		return folio_ops->get_folio(iter, pos, len);
 	else
-		return iomap_get_folio(iter, pos);
+		return iomap_get_folio(iter, pos, len);
 }
 
 static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..80facb9c9e5b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,7 +261,7 @@  int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..5d1341862c5d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -466,6 +466,19 @@  static inline void *detach_page_private(struct page *page)
 	return folio_detach_private(page_folio(page));
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
 #ifdef CONFIG_NUMA
 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
@@ -505,14 +518,20 @@  pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_STABLE		0x00000080
+#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
 
 #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
+static inline unsigned fgp_order(size_t size)
+{
+	return get_order(size) << 26;
+}
+
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -586,7 +605,7 @@  static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-					pgoff_t offset, int fgp_flags)
+					pgoff_t offset, unsigned fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..2eab5e6b6646 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,7 +1910,7 @@  void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
@@ -1952,7 +1952,9 @@  struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		folio_wait_stable(folio);
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
+		unsigned order = fgp_order(fgp_flags);
 		int err;
+
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
@@ -1961,26 +1963,38 @@  struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			gfp &= ~GFP_KERNEL;
 			gfp |= GFP_NOWAIT | __GFP_NOWARN;
 		}
-
-		folio = filemap_alloc_folio(gfp, 0);
-		if (!folio)
-			return ERR_PTR(-ENOMEM);
-
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		/* Init accessed so avoid atomic mark_page_accessed later */
-		if (fgp_flags & FGP_ACCESSED)
-			__folio_set_referenced(folio);
+		if (order > MAX_PAGECACHE_ORDER)
+			order = MAX_PAGECACHE_ORDER;
+		/* If we're not aligned, allocate a smaller folio */
+		if (index & ((1UL << order) - 1))
+			order = __ffs(index);
 
-		err = filemap_add_folio(mapping, folio, index, gfp);
-		if (unlikely(err)) {
+		do {
+			err = -ENOMEM;
+			if (order == 1)
+				order = 0;
+			folio = filemap_alloc_folio(gfp, order);
+			if (!folio)
+				continue;
+
+			/* Init accessed so avoid atomic mark_page_accessed later */
+			if (fgp_flags & FGP_ACCESSED)
+				__folio_set_referenced(folio);
+
+			err = filemap_add_folio(mapping, folio, index, gfp);
+			if (!err)
+				break;
 			folio_put(folio);
 			folio = NULL;
-			if (err == -EEXIST)
-				goto repeat;
-		}
+		} while (order-- > 0);
 
+		if (err == -EEXIST)
+			goto repeat;
+		if (err)
+			return ERR_PTR(err);
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..c96e88d9a262 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@  EXPORT_SYMBOL(add_to_page_cache_lru);
 
 noinline
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..59a071badb90 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -462,19 +462,6 @@  static int try_context_readahead(struct address_space *mapping,
 	return 1;
 }
 
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size.  I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER	8
-#endif
-
 static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 		pgoff_t mark, unsigned int order, gfp_t gfp)
 {
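
A closing note on the two versions in this thread: fgp_order() in the
patch above (the "first attempt" from the commit message) rounds the
order up with get_order(), so a write whose length is not a
power-of-two number of pages gets a folio larger than the write (the
case that triggered the uptodate warning), while the revised helper
from reply #1 rounds down via ilog2().  A sketch contrasting the two,
with userspace stand-ins for the kernel helpers and PAGE_SHIFT of 12
assumed:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Approximates get_order(): round up, as in the patch above. */
static unsigned int order_up(size_t size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

/* Approximates the ilog2()-based helper: round down, as in reply #1. */
static unsigned int order_down(size_t size)
{
	unsigned int shift = 0;

	while ((size >> shift) > 1)
		shift++;
	return shift <= PAGE_SHIFT ? 0 : shift - PAGE_SHIFT;
}

int main(void)
{
	/* For a 12kB write: round-up gives order 2 (a 16kB folio,
	 * larger than the write and so not fully uptodate); round-down
	 * gives order 1 (8kB), which never overshoots the write. */
	printf("up:%u down:%u\n", order_up(12 * 1024),
	       order_down(12 * 1024));
	return 0;
}

Note also that this version reads the order back in
__filemap_get_folio() with fgp_order(fgp_flags) where the revised
patch uses FGP_ORDER(fgp_flags); the encode helper and the decode
macro are distinct, which is presumably among the "other small
problems" fixed in reply #1.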