Message ID | 20230520163603.1794256-2-willy@infradead.org (mailing list archive) |
---|---|
State | Superseded, archived |
Series | Create large folios in iomap buffered write path |
Hi,

> Allow callers of __filemap_get_folio() to specify a preferred folio
> order in the FGP flags.  This is only honoured in the FGP_CREAT path;
> if there is already a folio in the page cache that covers the index,
> we will return it, no matter what its order is.  No create-around is
> attempted; we will only create folios which start at the specified index.
> Unmodified callers will continue to allocate order 0 folios.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

[...]

> +static inline unsigned fgp_order(size_t size)
> +{
> +	unsigned int shift = ilog2(size);
> +
> +	if (shift <= PAGE_SHIFT)
> +		return 0;
> +	return (shift - PAGE_SHIFT) << 26;

int overflow will happen when size > 0.5M(2**19)?

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2023/05/21
On Sun, May 21, 2023 at 09:02:36AM +0800, Wang Yugui wrote:
> > +static inline unsigned fgp_order(size_t size)
> > +{
> > +	unsigned int shift = ilog2(size);
> > +
> > +	if (shift <= PAGE_SHIFT)
> > +		return 0;
> > +	return (shift - PAGE_SHIFT) << 26;
>
> int overflow will happen when size > 0.5M(2**19)?

I don't see it?

size == 1 << 20;

shift = 20;
return (20 - 12) << 26;

Looks like about 1 << 29 to me.
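[A quick userspace check of the arithmetic, for readers following along -- a minimal sketch assuming PAGE_SHIFT is 12 (4kB pages, as on x86-64) and substituting a GCC builtin for the kernel's ilog2():]

```c
#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12	/* assumption: 4kB pages */

/* Userspace mirror of the patch's fgp_order(); ilog2() becomes a builtin. */
static unsigned int fgp_order(size_t size)
{
	unsigned int shift = 63 - __builtin_clzll(size);

	if (shift <= PAGE_SHIFT)
		return 0;
	return (shift - PAGE_SHIFT) << 26;
}

int main(void)
{
	/* Wang's case, size = 1MB: shift = 20, so the encoded value is
	 * (20 - 12) << 26 == 8 << 26 == 1 << 29 -- no 32-bit overflow. */
	printf("encoded: %u, order: %u\n",
	       fgp_order(1UL << 20), fgp_order(1UL << 20) >> 26);
	return 0;
}
```

Compiled with gcc this prints "encoded: 536870912, order: 8", i.e. exactly 1 << 29, matching the reply above.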
Hi,

> On Sun, May 21, 2023 at 09:02:36AM +0800, Wang Yugui wrote:
> > > +static inline unsigned fgp_order(size_t size)
> > > +{
> > > +	unsigned int shift = ilog2(size);
> > > +
> > > +	if (shift <= PAGE_SHIFT)
> > > +		return 0;
> > > +	return (shift - PAGE_SHIFT) << 26;
> >
> > int overflow will happen when size > 0.5M(2**19)?
>
> I don't see it?
>
> size == 1 << 20;
>
> shift = 20;
> return (20 - 12) << 26;
>
> Looks like about 1 << 29 to me.

Sorry, I made two mistakes:
1) I wrongly took PAGE_SHIFT to be 13 (from arch/alpha/include/asm/page.h);
   it should be 12 (from arch/x86/include/asm/page_types.h).
2) I wrongly read
   (20 - 12) << 26
   as
   1 << (20 - 12) << 26

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2023/05/21
On Sat, May 20, 2023 at 05:36:01PM +0100, Matthew Wilcox (Oracle) wrote:
> Allow callers of __filemap_get_folio() to specify a preferred folio
> order in the FGP flags.  This is only honoured in the FGP_CREAT path;
> if there is already a folio in the page cache that covers the index,
> we will return it, no matter what its order is.  No create-around is
> attempted; we will only create folios which start at the specified index.
> Unmodified callers will continue to allocate order 0 folios.

[...]

> +#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
>
>  #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
>
> +static inline unsigned fgp_order(size_t size)
> +{
> +	unsigned int shift = ilog2(size);
> +
> +	if (shift <= PAGE_SHIFT)
> +		return 0;
> +	return (shift - PAGE_SHIFT) << 26;
> +}

Doesn't check for being larger than MAX_PAGECACHE_ORDER.

Also: naming.  FGP_ORDER(fgp) to get the order stored in the fgp,
fgp_order(size) to get the order from the IO length.

Both are integers, the compiler is not going to tell us when we get
them the wrong way around, and it's impossible to determine which
one is right just from looking at the code.

Perhaps fgp_order_from_flags(fgp) and fgp_order_from_length(size)?

Also, why put the order in the high bits?  Shifting integers up into
unaligned high bits is prone to sign extension issues and overflows,
e.g. fgp_flags is passed around the filemap functions as a signed
integer, so using the high bit in a shifted value that is unsigned
seems like a recipe for unexpected sign extension bugs on
extraction.

Hence I'd much prefer low bits to be used for this sort of integer
encoding (i.e. use masks instead of shifts for extraction), and
flags fields should -always- use unsigned variables so that high bit
usage doesn't unexpectedly do the wrong thing....

Cheers,
Dave.
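[To make the sign-extension hazard concrete: a standalone sketch with a deliberately out-of-range order -- hypothetical, since as the reply below notes, a real order never reaches the top bit. The int conversion and the right shift of a negative value are implementation-defined, but behave as shown on the usual two's-complement ABIs:]

```c
#include <stdio.h>

int main(void)
{
	/* Suppose an order of 40 were ever encoded in the top 6 bits.
	 * 40 << 26 sets bit 31, so held in a signed int the flags word
	 * goes negative (the conversion wraps on common ABIs). */
	unsigned int encoded = 40u << 26;
	int fgp_flags = (int)encoded;

	/* Arithmetic right shift of a negative int sign-extends: */
	printf("%d\n", fgp_flags >> 26);		/* prints -24 */

	/* Extracting through an unsigned variable behaves: */
	printf("%u\n", (unsigned int)fgp_flags >> 26);	/* prints 40 */
	return 0;
}
```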
On Sun, May 21, 2023 at 12:13:57PM +1000, Dave Chinner wrote:
> > +static inline unsigned fgp_order(size_t size)
> > +{
> > +	unsigned int shift = ilog2(size);
> > +
> > +	if (shift <= PAGE_SHIFT)
> > +		return 0;
> > +	return (shift - PAGE_SHIFT) << 26;
> > +}
>
> Doesn't check for being larger than MAX_PAGECACHE_ORDER.

It doesn't need to.  I check it on extraction.  We've got six bits, so
we can't overflow it.

> Also: naming.  FGP_ORDER(fgp) to get the order stored in the fgp,
> fgp_order(size) to get the order from the IO length.
>
> Both are integers, the compiler is not going to tell us when we get
> them the wrong way around, and it's impossible to determine which
> one is right just from looking at the code.
>
> Perhaps fgp_order_from_flags(fgp) and fgp_order_from_length(size)?

Yeah, I don't like that either.  I could be talked into
fgp_set_order(size) and fgp_get_order(fgp).  Also we should type the
FGP flags like we type the GFP flags.

> Also, why put the order in the high bits?  Shifting integers up into
> unaligned high bits is prone to sign extension issues and overflows.

As long as it's an unsigned int in the function which does the
extraction, there's no problem.  It's also kind of hard to set the top
bit -- you'd have to somehow get a 2^44 byte write into iomap.

> Hence I'd much prefer low bits to be used for this sort of integer
> encoding (i.e. use masks instead of shifts for extraction), and
> flags fields should -always- use unsigned variables so that high bit
> usage doesn't unexpectedly do the wrong thing....

There are some encoding advantages to using low bits for flags.  Does
depend on the architecture; x86 is particularly prone to this kind of
thing, but ARM has various constraints on what constants it can
represent as immediates.  I've rarely had cause to care about other
architecture details, but generally low bits are better supported as
flags than high bits.
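["Type the FGP flags like we type the GFP flags" refers to sparse's __bitwise annotation. A hypothetical sketch of what that could look like -- the fgp_t name and the flag values here are illustrative, not part of the patch under review:]

```c
#include <stdio.h>

/* Userspace stand-ins: in the kernel these come from <linux/types.h>
 * and only have teeth under sparse ('make C=1'). */
#define __bitwise
#define __force

/* A distinct flag type, modeled on gfp_t.  Mixing a plain int with an
 * fgp_t would then be flagged by sparse rather than compiling silently. */
typedef unsigned int __bitwise fgp_t;

#define FGP_ACCESSED	((__force fgp_t)0x00000001)
#define FGP_LOCK	((__force fgp_t)0x00000002)

static void takes_flags(fgp_t fgp_flags)
{
	/* Under sparse, passing a bare int here would warn. */
	printf("%u\n", (__force unsigned int)fgp_flags);
}

int main(void)
{
	takes_flags(FGP_ACCESSED | FGP_LOCK);	/* prints 3 */
	return 0;
}
```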
On Sun, May 21, 2023 at 10:04:35AM +0800, Wang Yugui wrote:
> > I don't see it?
> >
> > size == 1 << 20;
> >
> > shift = 20;
> > return (20 - 12) << 26;
> >
> > Looks like about 1 << 29 to me.
>
> Sorry, I made two mistakes:
> 1) I wrongly took PAGE_SHIFT to be 13 (from arch/alpha/include/asm/page.h);
>    it should be 12 (from arch/x86/include/asm/page_types.h).
> 2) I wrongly read
>    (20 - 12) << 26
>    as
>    1 << (20 - 12) << 26

Ah, no problem.  Glad I didn't miss something.
On Sat, May 20, 2023 at 05:36:01PM +0100, Matthew Wilcox (Oracle) wrote:
> +#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
Why don't we just add a new argument for the order?
On Mon, May 22, 2023 at 10:59:17PM -0700, Christoph Hellwig wrote:
> On Sat, May 20, 2023 at 05:36:01PM +0100, Matthew Wilcox (Oracle) wrote:
> > +#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
>
> Why don't we just add a new argument for the order?

Because it already takes four arguments and has dozens of callers, most
of which would have the uninformative '0' added to them?
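[An illustrative pair of call sites -- hypothetical code in the style of the iomap conversion later in this series; mapping, index, len and gfp are assumed to be in scope:]

```c
struct folio *folio;

/* Today's callers are untouched -- no encoded order means order 0: */
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, gfp);

/* A caller that knows the I/O length opts in via the flags word: */
folio = __filemap_get_folio(mapping, index,
			    FGP_WRITEBEGIN | fgp_order(len), gfp);

/* The alternative -- a fifth parameter -- would add a meaningless 0 to
 * dozens of existing call sites:
 *	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, gfp, 0);
 */
```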
Allow callers of __filemap_get_folio() to specify a preferred folio
order in the FGP flags.  This is only honoured in the FGP_CREAT path;
if there is already a folio in the page cache that covers the index,
we will return it, no matter what its order is.  No create-around is
attempted; we will only create folios which start at the specified index.
Unmodified callers will continue to allocate order 0 folios.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/pagemap.h | 29 ++++++++++++++++++++++++---
 mm/filemap.c            | 44 ++++++++++++++++++++++++++++-------------
 mm/folio-compat.c       |  2 +-
 mm/readahead.c          | 13 -------------
 4 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..f4d05beb64eb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -466,6 +466,19 @@ static inline void *detach_page_private(struct page *page)
 	return folio_detach_private(page_folio(page));
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
 #ifdef CONFIG_NUMA
 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
@@ -505,14 +518,24 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_STABLE		0x00000080
+#define FGP_ORDER(fgp)		((fgp) >> 26)	/* top 6 bits */
 
 #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
+static inline unsigned fgp_order(size_t size)
+{
+	unsigned int shift = ilog2(size);
+
+	if (shift <= PAGE_SHIFT)
+		return 0;
+	return (shift - PAGE_SHIFT) << 26;
+}
+
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		unsigned fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -586,7 +609,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-			pgoff_t offset, int fgp_flags)
+			pgoff_t offset, unsigned fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..5935c7aac388 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,7 +1910,7 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
@@ -1952,7 +1952,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		folio_wait_stable(folio);
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
+		unsigned order = FGP_ORDER(fgp_flags);
 		int err;
+
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
@@ -1961,26 +1963,40 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			gfp &= ~GFP_KERNEL;
 			gfp |= GFP_NOWAIT | __GFP_NOWARN;
 		}
-
-		folio = filemap_alloc_folio(gfp, 0);
-		if (!folio)
-			return ERR_PTR(-ENOMEM);
-
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		/* Init accessed so avoid atomic mark_page_accessed later */
-		if (fgp_flags & FGP_ACCESSED)
-			__folio_set_referenced(folio);
+		if (!mapping_large_folio_support(mapping))
+			order = 0;
+		if (order > MAX_PAGECACHE_ORDER)
+			order = MAX_PAGECACHE_ORDER;
+		/* If we're not aligned, allocate a smaller folio */
+		if (index & ((1UL << order) - 1))
+			order = __ffs(index);
 
-		err = filemap_add_folio(mapping, folio, index, gfp);
-		if (unlikely(err)) {
+		do {
+			err = -ENOMEM;
+			if (order == 1)
+				order = 0;
+			folio = filemap_alloc_folio(gfp, order);
+			if (!folio)
+				continue;
+
+			/* Init accessed so avoid atomic mark_page_accessed later */
+			if (fgp_flags & FGP_ACCESSED)
+				__folio_set_referenced(folio);
+
+			err = filemap_add_folio(mapping, folio, index, gfp);
+			if (!err)
+				break;
 			folio_put(folio);
 			folio = NULL;
-			if (err == -EEXIST)
-				goto repeat;
-		}
+		} while (order-- > 0);
 
+		if (err == -EEXIST)
+			goto repeat;
+		if (err)
+			return ERR_PTR(err);
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..c96e88d9a262 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
 
 noinline
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		unsigned fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..59a071badb90 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -462,19 +462,6 @@ static int try_context_readahead(struct address_space *mapping,
 	return 1;
 }
 
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size.  I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER	8
-#endif
-
 static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 		pgoff_t mark, unsigned int order, gfp_t gfp)
 {
--
2.39.2
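[To trace the new allocation policy in the FGP_CREAT path above, a small userspace simulation -- pick_order() and alloc_fails_above are inventions for this sketch, not kernel API:]

```c
#include <stdio.h>

#define MAX_PAGECACHE_ORDER 8	/* assumption: THP disabled, per the patch */

/* Simulates the order selection and fallback from the patch: clamp to
 * MAX_PAGECACHE_ORDER, round down to the index's alignment, skip order 1,
 * then fall back toward order 0.  alloc_fails_above stands in for
 * filemap_alloc_folio() failing on large orders under memory pressure. */
static unsigned pick_order(unsigned long index, unsigned order,
			   unsigned alloc_fails_above)
{
	if (order > MAX_PAGECACHE_ORDER)
		order = MAX_PAGECACHE_ORDER;
	/* If we're not aligned, allocate a smaller folio */
	if (index & ((1UL << order) - 1))
		order = __builtin_ctzl(index);	/* stands in for __ffs() */

	do {
		if (order == 1)			/* order-1 folios not allowed */
			order = 0;
		if (order <= alloc_fails_above)
			return order;		/* "allocation" succeeded */
	} while (order-- > 0);
	return 0;
}

int main(void)
{
	/* index 0x30 is only aligned to 16 pages: an order-8 request drops
	 * to order 4 before any allocation is attempted. */
	printf("%u\n", pick_order(0x30, 8, 8));		/* prints 4 */
	/* under pressure, only small allocations succeed: 8, 7, ... fail
	 * until order 2 works. */
	printf("%u\n", pick_order(0x100, 8, 2));	/* prints 2 */
	return 0;
}
```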