Message ID | ZGZwNqYhttjREl0V@casper.infradead.org (mailing list archive)
---|---
State | Deferred, archived
Series | Creating large folios in iomap buffered write path
On Thu, May 18, 2023 at 07:36:38PM +0100, Matthew Wilcox wrote:
> Not so much a "folio problem" as "an enhancement nobody got around to doing
> yet".  Here's a first attempt.  It's still churning through an xfstests
> run for me.  I have seen this warning trigger:
>
>         WARN_ON_ONCE(!folio_test_uptodate(folio) &&
>                         folio_test_dirty(folio));
>
> in iomap_invalidate_folio() as it's now possible to create a folio
> for write that is larger than the write, and therefore we won't
> mark it uptodate.  Maybe we should create slightly smaller folios.

Here's one that does.  A couple of other small problems also fixed.

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c739b258a2d9..3702e5e47b0f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 	if (status)
 		return ERR_PTR(status);
 
-	folio = iomap_get_folio(iter, pos);
+	folio = iomap_get_folio(iter, pos, len);
 	if (IS_ERR(folio))
 		gfs2_trans_end(sdp);
 	return folio;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 063133ec77f4..32ddddf9f35c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -461,19 +461,25 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  * iomap_get_folio - get a folio reference for writing
  * @iter: iteration structure
  * @pos: start offset of write
+ * @len: length of write
  *
  * Returns a locked reference to the folio at @pos, or an error pointer if the
  * folio could not be obtained.
  */
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
+	struct folio *folio;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	fgp |= fgp_order(len);
 
-	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
+	if (!IS_ERR(folio) && folio_test_large(folio))
+		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
+	return folio;
 }
 EXPORT_SYMBOL_GPL(iomap_get_folio);
 
@@ -510,8 +516,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 		iomap_page_release(folio);
 	} else if (folio_test_large(folio)) {
 		/* Must release the iop so the page can be split */
-		WARN_ON_ONCE(!folio_test_uptodate(folio) &&
-				folio_test_dirty(folio));
+		VM_WARN_ON_ONCE_FOLIO(!folio_test_uptodate(folio) &&
+				folio_test_dirty(folio), folio);
 		iomap_page_release(folio);
 	}
 }
@@ -603,7 +609,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
 	if (folio_ops && folio_ops->get_folio)
 		return folio_ops->get_folio(iter, pos, len);
 	else
-		return iomap_get_folio(iter, pos);
+		return iomap_get_folio(iter, pos, len);
 }
 
 static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..80facb9c9e5b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,7 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..f4d05beb64eb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -466,6 +466,19 @@ static inline void *detach_page_private(struct page *page)
 	return folio_detach_private(page_folio(page));
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
 #ifdef CONFIG_NUMA
 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
@@ -505,14 +518,24 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_STABLE		0x00000080
+#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
 
 #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
+static inline unsigned fgp_order(size_t size)
+{
+	unsigned int shift = ilog2(size);
+
+	if (shift <= PAGE_SHIFT)
+		return 0;
+	return (shift - PAGE_SHIFT) << 26;
+}
+
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -586,7 +609,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-		pgoff_t offset, int fgp_flags)
+		pgoff_t offset, unsigned fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..7abbb072d4d9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,7 +1910,7 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
@@ -1952,7 +1952,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		folio_wait_stable(folio);
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
+		unsigned order = FGP_ORDER(fgp_flags);
 		int err;
+
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
@@ -1961,26 +1963,38 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			gfp &= ~GFP_KERNEL;
 			gfp |= GFP_NOWAIT | __GFP_NOWARN;
 		}
-
-		folio = filemap_alloc_folio(gfp, 0);
-		if (!folio)
-			return ERR_PTR(-ENOMEM);
-
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		/* Init accessed so avoid atomic mark_page_accessed later */
-		if (fgp_flags & FGP_ACCESSED)
-			__folio_set_referenced(folio);
+		if (order > MAX_PAGECACHE_ORDER)
+			order = MAX_PAGECACHE_ORDER;
+		/* If we're not aligned, allocate a smaller folio */
+		if (index & ((1UL << order) - 1))
+			order = __ffs(index);
 
-		err = filemap_add_folio(mapping, folio, index, gfp);
-		if (unlikely(err)) {
+		do {
+			err = -ENOMEM;
+			if (order == 1)
+				order = 0;
+			folio = filemap_alloc_folio(gfp, order);
+			if (!folio)
+				continue;
+
+			/* Init accessed so avoid atomic mark_page_accessed later */
+			if (fgp_flags & FGP_ACCESSED)
+				__folio_set_referenced(folio);
+
+			err = filemap_add_folio(mapping, folio, index, gfp);
+			if (!err)
+				break;
 			folio_put(folio);
 			folio = NULL;
-			if (err == -EEXIST)
-				goto repeat;
-		}
+		} while (order-- > 0);
+		if (err == -EEXIST)
+			goto repeat;
+		if (err)
+			return ERR_PTR(err);
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..c96e88d9a262 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
 
 noinline
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..59a071badb90 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -462,19 +462,6 @@ static int try_context_readahead(struct address_space *mapping,
 	return 1;
 }
 
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size.  I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER	8
-#endif
-
 static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 		pgoff_t mark, unsigned int order, gfp_t gfp)
 {
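[The patch above packs the requested allocation order into the otherwise
unused top six bits of the fgp flags word, so one unsigned value carries
both the existing FGP_* flags and a folio order.  The following is a
rough userspace sketch of that round trip, not kernel code: PAGE_SHIFT
is assumed to be 12 and ilog2() is open-coded purely for illustration.]

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed 4kB pages, for illustration only */

/* Mirrors the helpers in the patch above: the order lives in the top 6 bits. */
#define FGP_ORDER(fgp)	((fgp) >> 26)

static unsigned fgp_order(size_t size)
{
	unsigned shift = 0;

	/* open-coded stand-in for ilog2(): floor(log2(size)) */
	while (size > 1) {
		size >>= 1;
		shift++;
	}
	if (shift <= PAGE_SHIFT)
		return 0;
	return (shift - PAGE_SHIFT) << 26;
}

int main(void)
{
	size_t lens[] = { 292, 4096, 65536, 1048576 };

	for (int i = 0; i < 4; i++) {
		unsigned fgp = fgp_order(lens[i]);

		printf("len=%7zu  encoded=%#10x  order=%u\n",
		       lens[i], fgp, FGP_ORDER(fgp));
	}
	return 0;
}

[With these inputs, only a write of at least two pages encodes a
non-zero order; sub-page writes still map to order 0 in this version.]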
On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> {
> 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> +	struct folio *folio;
> 
> 	if (iter->flags & IOMAP_NOWAIT)
> 		fgp |= FGP_NOWAIT;
> +	fgp |= fgp_order(len);
> 
> -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> +	if (!IS_ERR(folio) && folio_test_large(folio))
> +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> +	return folio;
> }

Forgot to take the debugging out.  This should read:

-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	fgp |= fgp_order(len);
 
 	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
 }
Hi,

> On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > {
> > 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > +	struct folio *folio;
> > 
> > 	if (iter->flags & IOMAP_NOWAIT)
> > 		fgp |= FGP_NOWAIT;
> > +	fgp |= fgp_order(len);
> > 
> > -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > +	if (!IS_ERR(folio) && folio_test_large(folio))
> > +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> > +	return folio;
> > }
> 
> Forgot to take the debugging out.  This should read:
> 
> -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
>  {
>  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> 
>  	if (iter->flags & IOMAP_NOWAIT)
>  		fgp |= FGP_NOWAIT;
> +	fgp |= fgp_order(len);
> 
>  	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
>  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
>  }

I test it (attachment file) on 6.4.0-rc2.
fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 -numjobs=4 -directory=/mnt/test

fio WRITE: bw=2430MiB/s.
expected value: > 6000MiB/s
so yet no fio write bandwidth improvement.

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2023/05/19
On Fri, May 19, 2023 at 10:55:29AM +0800, Wang Yugui wrote:
> Hi,
> 
> > On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> > > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > > {
> > > 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > > +	struct folio *folio;
> > > 
> > > 	if (iter->flags & IOMAP_NOWAIT)
> > > 		fgp |= FGP_NOWAIT;
> > > +	fgp |= fgp_order(len);
> > > 
> > > -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > > +	if (!IS_ERR(folio) && folio_test_large(folio))
> > > +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> > > +	return folio;
> > > }
> > 
> > Forgot to take the debugging out.  This should read:
> > 
> > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> >  {
> >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > 
> >  	if (iter->flags & IOMAP_NOWAIT)
> >  		fgp |= FGP_NOWAIT;
> > +	fgp |= fgp_order(len);
> > 
> >  	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> >  }
> 
> I test it (attachment file) on 6.4.0-rc2.
> fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 -numjobs=4 -directory=/mnt/test
> 
> fio WRITE: bw=2430MiB/s.
> expected value: > 6000MiB/s
> so yet no fio write bandwidth improvement.

That's basically unchanged.  There's no per-page or per-block work being
done in start/end writeback, so if Dave's investigation is applicable
to your situation, I'd expect to see an improvement.

Maybe try the second version of the patch I sent with the debug in,
to confirm you really are seeing large folios being created (you might
want to use printk_ratelimit() instead of printk to ensure it doesn't
overwhelm your system)?  That fio command you were using ought to create
them, but there's always a chance it doesn't.

Perhaps you could use perf (the command Dave used) to see where the time
is being spent.
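[For completeness, the rate-limited form of the debug line suggested
above could look like this inside iomap_get_folio(); a sketch only,
reusing the printk format from the earlier debug patch and the in-kernel
printk_ratelimit() helper being referred to.]

	/* Sketch of the suggested change: same debug output, but rate-limited */
	if (!IS_ERR(folio) && folio_test_large(folio) && printk_ratelimit())
		printk("index:%lu len:%zu order:%u\n",
		       (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));

[The "iomap_get_folio: 342 callbacks suppressed" line in the test report
below is the rate limiter's summary when messages are dropped.]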
Hi,

> On Fri, May 19, 2023 at 10:55:29AM +0800, Wang Yugui wrote:
> > Hi,
> > 
> > > On Thu, May 18, 2023 at 10:46:43PM +0100, Matthew Wilcox wrote:
> > > > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > > > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > > > {
> > > > 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > > > +	struct folio *folio;
> > > > 
> > > > 	if (iter->flags & IOMAP_NOWAIT)
> > > > 		fgp |= FGP_NOWAIT;
> > > > +	fgp |= fgp_order(len);
> > > > 
> > > > -	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > > +	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > > > 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > > > +	if (!IS_ERR(folio) && folio_test_large(folio))
> > > > +		printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
> > > > +	return folio;
> > > > }
> > > 
> > > Forgot to take the debugging out.  This should read:
> > > 
> > > -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
> > > +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
> > >  {
> > >  	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
> > > 
> > >  	if (iter->flags & IOMAP_NOWAIT)
> > >  		fgp |= FGP_NOWAIT;
> > > +	fgp |= fgp_order(len);
> > > 
> > >  	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
> > >  			fgp, mapping_gfp_mask(iter->inode->i_mapping));
> > >  }
> > 
> > I test it (attachment file) on 6.4.0-rc2.
> > fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 -numjobs=4 -directory=/mnt/test
> > 
> > fio WRITE: bw=2430MiB/s.
> > expected value: > 6000MiB/s
> > so yet no fio write bandwidth improvement.
> 
> That's basically unchanged.  There's no per-page or per-block work being
> done in start/end writeback, so if Dave's investigation is applicable
> to your situation, I'd expect to see an improvement.
> 
> Maybe try the second version of the patch I sent with the debug in,
> to confirm you really are seeing large folios being created (you might
> want to use printk_ratelimit() instead of printk to ensure it doesn't
> overwhelm your system)?  That fio command you were using ought to create
> them, but there's always a chance it doesn't.
> 
> Perhaps you could use perf (the command Dave used) to see where the time
> is being spent.

test result of the second version of the patch.

# dmesg |grep 'index\|suppressed'
[ 89.376149] index:0 len:292 order:2
[ 97.862938] index:0 len:4096 order:2
[ 98.340665] index:0 len:4096 order:2
[ 98.346633] index:0 len:4096 order:2
[ 98.352323] index:0 len:4096 order:2
[ 98.359952] index:0 len:4096 order:2
[ 98.364015] index:3 len:4096 order:2
[ 98.368943] index:0 len:4096 order:2
[ 98.374285] index:0 len:4096 order:2
[ 98.379163] index:3 len:4096 order:2
[ 98.384760] index:0 len:4096 order:2
[ 181.103751] iomap_get_folio: 342 callbacks suppressed
[ 181.103761] index:0 len:292 order:2

'perf report -g' result:

Samples: 344K of event 'cycles', Event count (approx.): 103747884662
  Children      Self  Command          Shared Object       Symbol
+   58.73%     0.01%  fio              [kernel.kallsyms]   [k] entry_SYSCALL_64_after_hwframe
+   58.72%     0.01%  fio              [kernel.kallsyms]   [k] do_syscall_64
+   58.53%     0.00%  fio              libpthread-2.17.so  [.] 0x00007f83e400e6fd
+   58.47%     0.01%  fio              [kernel.kallsyms]   [k] ksys_write
+   58.45%     0.02%  fio              [kernel.kallsyms]   [k] vfs_write
+   58.41%     0.03%  fio              [kernel.kallsyms]   [k] xfs_file_buffered_write
+   57.96%     0.57%  fio              [kernel.kallsyms]   [k] iomap_file_buffered_write
+   27.57%     1.29%  fio              [kernel.kallsyms]   [k] iomap_write_begin
+   25.32%     0.43%  fio              [kernel.kallsyms]   [k] iomap_get_folio
+   24.84%     0.70%  fio              [kernel.kallsyms]   [k] __filemap_get_folio.part.69
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] ret_from_fork
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] kthread
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] worker_thread
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] process_one_work
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] wb_workfn
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] wb_writeback
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] __writeback_inodes_wb
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] writeback_sb_inodes
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] __writeback_single_inode
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] do_writepages
+   20.11%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] xfs_vm_writepages
+   20.10%     0.00%  kworker/u98:15-  [kernel.kallsyms]   [k] iomap_writepages
+   20.10%     0.69%  kworker/u98:15-  [kernel.kallsyms]   [k] write_cache_pages
+   16.95%     0.39%  fio              [kernel.kallsyms]   [k] copy_page_from_iter_atomic
+   16.53%     0.10%  fio              [kernel.kallsyms]   [k] copyin

'perf report' result:

Samples: 335K of event 'cycles', Event count (approx.): 108508755052
Overhead  Command          Shared Object      Symbol
  17.70%  fio              [kernel.kallsyms]  [k] rep_movs_alternative
   2.89%  kworker/u98:2-e  [kernel.kallsyms]  [k] native_queued_spin_lock_slowpath
   2.88%  fio              [kernel.kallsyms]  [k] get_page_from_freelist
   2.67%  fio              [kernel.kallsyms]  [k] native_queued_spin_lock_slowpath
   2.59%  fio              [kernel.kallsyms]  [k] asm_exc_nmi
   2.25%  swapper          [kernel.kallsyms]  [k] intel_idle
   1.59%  kworker/u98:2-e  [kernel.kallsyms]  [k] __folio_start_writeback
   1.52%  fio              [kernel.kallsyms]  [k] xas_load
   1.45%  fio              [kernel.kallsyms]  [k] lru_add_fn
   1.44%  fio              [kernel.kallsyms]  [k] xas_descend
   1.32%  fio              [kernel.kallsyms]  [k] iomap_write_begin
   1.29%  fio              [kernel.kallsyms]  [k] __filemap_add_folio
   1.08%  kworker/u98:2-e  [kernel.kallsyms]  [k] folio_clear_dirty_for_io
   1.07%  fio              [kernel.kallsyms]  [k] __folio_mark_dirty
   0.94%  fio              [kernel.kallsyms]  [k] iomap_set_range_uptodate.part.24
   0.93%  fio              [kernel.kallsyms]  [k] node_dirty_ok
   0.92%  kworker/u98:2-e  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
   0.83%  fio              [kernel.kallsyms]  [k] xas_start
   0.83%  fio              [kernel.kallsyms]  [k] __alloc_pages
   0.83%  fio              [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
   0.81%  kworker/u98:2-e  [kernel.kallsyms]  [k] asm_exc_nmi
   0.79%  fio              [kernel.kallsyms]  [k] percpu_counter_add_batch
   0.75%  kworker/u98:2-e  [kernel.kallsyms]  [k] iomap_writepage_map
   0.74%  kworker/u98:2-e  [kernel.kallsyms]  [k] __mod_lruvec_page_state
   0.70%  fio              fio                [.] 0x000000000001b1ac
   0.70%  fio              [kernel.kallsyms]  [k] filemap_dirty_folio
   0.69%  kworker/u98:2-e  [kernel.kallsyms]  [k] write_cache_pages
   0.69%  fio              [kernel.kallsyms]  [k] __filemap_get_folio.part.69
   0.67%  kworker/1:0-eve  [kernel.kallsyms]  [k] native_queued_spin_lock_slowpath
   0.66%  fio              [kernel.kallsyms]  [k] __mod_lruvec_page_state
   0.64%  fio              [kernel.kallsyms]  [k] __mod_node_page_state
   0.64%  fio              [kernel.kallsyms]  [k] folio_add_lru
   0.64%  fio              [kernel.kallsyms]  [k] balance_dirty_pages_ratelimited_flags
   0.62%  fio              [kernel.kallsyms]  [k] __mod_memcg_lruvec_state
   0.61%  fio              [kernel.kallsyms]  [k] iomap_write_end
   0.60%  fio              [kernel.kallsyms]  [k] xas_find_conflict
   0.59%  fio              [kernel.kallsyms]  [k] bad_range
   0.58%  kworker/u98:2-e  [kernel.kallsyms]  [k] xas_load
   0.57%  fio              [kernel.kallsyms]  [k] iomap_file_buffered_write
   0.56%  kworker/u98:2-e  [kernel.kallsyms]  [k] percpu_counter_add_batch
   0.49%  fio              [kernel.kallsyms]  [k] __might_resched

'perf top -g -U' result:

Samples: 78K of event 'cycles', 4000 Hz, Event count (approx.): 29400815085 lost: 0/0 drop: 0/0
  Children      Self  Shared Object       Symbol
+   62.59%     0.03%  [kernel]            [k] entry_SYSCALL_64_after_hwframe
+   60.15%     0.02%  [kernel]            [k] do_syscall_64
+   59.45%     0.02%  [kernel]            [k] vfs_write
+   59.09%     0.54%  [kernel]            [k] iomap_file_buffered_write
+   57.41%     0.00%  [kernel]            [k] ksys_write
+   57.36%     0.01%  [kernel]            [k] xfs_file_buffered_write
+   37.82%     0.00%  libpthread-2.17.so  [.] 0x00007fce6f20e6fd
+   26.83%     1.20%  [kernel]            [k] iomap_write_begin
+   24.65%     0.45%  [kernel]            [k] iomap_get_folio
+   24.15%     0.74%  [kernel]            [k] __filemap_get_folio.part.69
+   20.17%     0.00%  [kernel]            [k] __writeback_single_inode
+   20.17%     0.65%  [kernel]            [k] write_cache_pages
+   17.66%     0.43%  [kernel]            [k] copy_page_from_iter_atomic
+   17.18%     0.12%  [kernel]            [k] copyin
+   17.08%    16.71%  [kernel]            [k] rep_movs_alternative
+   16.97%     0.00%  [kernel]            [k] ret_from_fork
+   16.97%     0.00%  [kernel]            [k] kthread
+   16.87%     0.00%  [kernel]            [k] worker_thread
+   16.84%     0.00%  [kernel]            [k] process_one_work
+   14.86%     0.17%  [kernel]            [k] filemap_add_folio
+   13.83%     0.77%  [kernel]            [k] iomap_writepage_map
+   11.90%     0.33%  [kernel]            [k] iomap_finish_ioend
+   11.57%     0.23%  [kernel]            [k] folio_end_writeback
+   11.51%     0.73%  [kernel]            [k] iomap_write_end
+   11.30%     2.14%  [kernel]            [k] __folio_end_writeback
+   10.70%     0.00%  [kernel]            [k] wb_workfn
+   10.70%     0.00%  [kernel]            [k] wb_writeback
+   10.70%     0.00%  [kernel]            [k] __writeback_inodes_wb
+   10.70%     0.00%  [kernel]            [k] writeback_sb_inodes
+   10.70%     0.00%  [kernel]            [k] do_writepages
+   10.70%     0.00%  [kernel]            [k] xfs_vm_writepages
+   10.70%     0.00%  [kernel]            [k] iomap_writepages
+   10.19%     2.68%  [kernel]            [k] _raw_spin_lock_irqsave
+   10.17%     1.35%  [kernel]            [k] __filemap_add_folio
+    9.94%     0.00%  [unknown]           [k] 0x0000000001942a70
+    9.94%     0.00%  [unknown]           [k] 0x0000000001942ac0
+    9.94%     0.00%  [unknown]           [k] 0x0000000001942b30

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2023/05/20
On Sat, May 20, 2023 at 09:35:32PM +0800, Wang Yugui wrote:
> test result of the second version of the patch.
> 
> # dmesg |grep 'index\|suppressed'
> [ 89.376149] index:0 len:292 order:2
> [ 97.862938] index:0 len:4096 order:2
> [ 98.340665] index:0 len:4096 order:2
> [ 98.346633] index:0 len:4096 order:2
> [ 98.352323] index:0 len:4096 order:2
> [ 98.359952] index:0 len:4096 order:2
> [ 98.364015] index:3 len:4096 order:2
> [ 98.368943] index:0 len:4096 order:2
> [ 98.374285] index:0 len:4096 order:2
> [ 98.379163] index:3 len:4096 order:2
> [ 98.384760] index:0 len:4096 order:2
> [ 181.103751] iomap_get_folio: 342 callbacks suppressed
> [ 181.103761] index:0 len:292 order:2

Thanks.  Clearly we're not creating large folios in the write path.
I tracked it down, and used your fio command.  My system creates 1MB
folios, so I think yours will too.  Patch series incoming (I fixed a
couple of other oversights too).
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c739b258a2d9..3702e5e47b0f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 	if (status)
 		return ERR_PTR(status);
 
-	folio = iomap_get_folio(iter, pos);
+	folio = iomap_get_folio(iter, pos, len);
 	if (IS_ERR(folio))
 		gfs2_trans_end(sdp);
 	return folio;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 063133ec77f4..21f33731617a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -461,16 +461,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  * iomap_get_folio - get a folio reference for writing
  * @iter: iteration structure
  * @pos: start offset of write
+ * @len: length of write
  *
  * Returns a locked reference to the folio at @pos, or an error pointer if the
  * folio could not be obtained.
  */
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
 	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	fgp |= fgp_order(len);
 
 	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
@@ -603,7 +605,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
 	if (folio_ops && folio_ops->get_folio)
 		return folio_ops->get_folio(iter, pos, len);
 	else
-		return iomap_get_folio(iter, pos);
+		return iomap_get_folio(iter, pos, len);
 }
 
 static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..80facb9c9e5b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,7 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..5d1341862c5d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -466,6 +466,19 @@ static inline void *detach_page_private(struct page *page)
 	return folio_detach_private(page_folio(page));
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
 #ifdef CONFIG_NUMA
 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
@@ -505,14 +518,20 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_STABLE		0x00000080
+#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
 
 #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
+static inline unsigned fgp_order(size_t size)
+{
+	return get_order(size) << 26;
+}
+
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -586,7 +605,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-		pgoff_t offset, int fgp_flags)
+		pgoff_t offset, unsigned fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..2eab5e6b6646 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,7 +1910,7 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
@@ -1952,7 +1952,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		folio_wait_stable(folio);
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
+		unsigned order = fgp_order(fgp_flags);
 		int err;
+
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
@@ -1961,26 +1963,38 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			gfp &= ~GFP_KERNEL;
 			gfp |= GFP_NOWAIT | __GFP_NOWARN;
 		}
-
-		folio = filemap_alloc_folio(gfp, 0);
-		if (!folio)
-			return ERR_PTR(-ENOMEM);
-
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		/* Init accessed so avoid atomic mark_page_accessed later */
-		if (fgp_flags & FGP_ACCESSED)
-			__folio_set_referenced(folio);
+		if (order > MAX_PAGECACHE_ORDER)
+			order = MAX_PAGECACHE_ORDER;
+		/* If we're not aligned, allocate a smaller folio */
+		if (index & ((1UL << order) - 1))
+			order = __ffs(index);
 
-		err = filemap_add_folio(mapping, folio, index, gfp);
-		if (unlikely(err)) {
+		do {
+			err = -ENOMEM;
+			if (order == 1)
+				order = 0;
+			folio = filemap_alloc_folio(gfp, order);
+			if (!folio)
+				continue;
+
+			/* Init accessed so avoid atomic mark_page_accessed later */
+			if (fgp_flags & FGP_ACCESSED)
+				__folio_set_referenced(folio);
+
+			err = filemap_add_folio(mapping, folio, index, gfp);
+			if (!err)
+				break;
 			folio_put(folio);
 			folio = NULL;
-			if (err == -EEXIST)
-				goto repeat;
-		}
+		} while (order-- > 0);
+		if (err == -EEXIST)
+			goto repeat;
+		if (err)
+			return ERR_PTR(err);
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..c96e88d9a262 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
 
 noinline
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..59a071badb90 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -462,19 +462,6 @@ static int try_context_readahead(struct address_space *mapping,
 	return 1;
 }
 
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size.  I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER	8
-#endif
-
 static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 		pgoff_t mark, unsigned int order, gfp_t gfp)
 {
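[One detail worth noting when comparing the two fgp_order() versions in
this thread: the first derives the order with ilog2(), which rounds the
write length down, while this one uses get_order(), which rounds up to
the number of pages needed to cover the length.  The following is a
small userspace sketch of that difference; 4kB pages are assumed and
both helpers are re-implemented here for illustration rather than taken
from kernel headers.]

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed 4kB pages, for illustration only */

/* First version: floor(log2(size)) relative to the page size (ilog2-style) */
static unsigned order_rounded_down(size_t size)
{
	unsigned shift = 0;

	while (size > 1) {
		size >>= 1;
		shift++;
	}
	return shift <= PAGE_SHIFT ? 0 : shift - PAGE_SHIFT;
}

/* Second version: get_order()-style, smallest order whose folio covers size */
static unsigned order_rounded_up(size_t size)
{
	size_t pages = (size + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;
	unsigned order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	size_t lens[] = { 4096, 6000, 65536, 1048576 - 512 };

	for (int i = 0; i < 4; i++)
		printf("len=%7zu  rounded-down order=%u  rounded-up order=%u\n",
		       lens[i], order_rounded_down(lens[i]),
		       order_rounded_up(lens[i]));
	return 0;
}

[For a page-aligned power-of-two length the two agree; for anything else
the get_order() form asks for the next larger folio, which
__filemap_get_folio() can still trim back if the index is not aligned or
the larger allocation fails.]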