Message ID | 20240213093713.1753368-2-kernel@pankajraghav.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | enable bs > ps in XFS | expand |
On 2/13/24 10:37, Pankaj Raghav (Samsung) wrote: > From: "Matthew Wilcox (Oracle)" <willy@infradead.org> > > Some filesystems want to be able to limit the maximum size of folios, > and some want to be able to ensure that folios are at least a certain > size. Add mapping_set_folio_orders() to allow this level of control. > The max folio order parameter is ignored and it is always set to > MAX_PAGECACHE_ORDER. > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org> > --- > include/linux/pagemap.h | 92 ++++++++++++++++++++++++++++++++--------- > 1 file changed, 73 insertions(+), 19 deletions(-) > Reviewed-by: Hannes Reinecke <hare@suse.de> Cheers, Hannes
On Tue, Feb 13, 2024 at 10:37:00AM +0100, Pankaj Raghav (Samsung) wrote: > From: "Matthew Wilcox (Oracle)" <willy@infradead.org> > > Some filesystems want to be able to limit the maximum size of folios, > and some want to be able to ensure that folios are at least a certain > size. Add mapping_set_folio_orders() to allow this level of control. > The max folio order parameter is ignored and it is always set to > MAX_PAGECACHE_ORDER. Why? If MAX_PAGECACHE_ORDER is 8 and I instead pass in max==3, I'm going to be surprised by my constraint being ignored. Maybe I said that because I'm not prepared to handle an order-7 folio; or some customer will have some weird desire to twist this knob to make their workflow faster. --D > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > Signed-off-by: Luis Chamberlain <mcgrof@kernel.org> > --- > include/linux/pagemap.h | 92 ++++++++++++++++++++++++++++++++--------- > 1 file changed, 73 insertions(+), 19 deletions(-) > > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h > index 2df35e65557d..5618f762187b 100644 > --- a/include/linux/pagemap.h > +++ b/include/linux/pagemap.h > @@ -202,13 +202,18 @@ enum mapping_flags { > AS_EXITING = 4, /* final truncate in progress */ > /* writeback related tags are not used */ > AS_NO_WRITEBACK_TAGS = 5, > - AS_LARGE_FOLIO_SUPPORT = 6, > - AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ > - AS_STABLE_WRITES, /* must wait for writeback before modifying > + AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ > + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying > folio contents */ > - AS_UNMOVABLE, /* The mapping cannot be moved, ever */ > + AS_FOLIO_ORDER_MIN = 8, > + AS_FOLIO_ORDER_MAX = 13, /* Bit 8-17 are used for FOLIO_ORDER */ > + AS_UNMOVABLE = 18, /* The mapping cannot be moved, ever */ > }; > > +#define AS_FOLIO_ORDER_MIN_MASK 0x00001f00 > +#define AS_FOLIO_ORDER_MAX_MASK 0x0003e000 > +#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) > + > /** > * mapping_set_error - record a writeback error in the address_space > * @mapping: the mapping in which an error should be set > @@ -344,6 +349,53 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) > m->gfp_mask = mask; > } > > +/* > + * There are some parts of the kernel which assume that PMD entries > + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, > + * limit the maximum allocation order to PMD size. I'm not aware of any > + * assumptions about maximum order if THP are disabled, but 8 seems like > + * a good order (that's 1MB if you're using 4kB pages) > + */ > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER > +#else > +#define MAX_PAGECACHE_ORDER 8 > +#endif > + > +/* > + * mapping_set_folio_orders() - Set the range of folio sizes supported. > + * @mapping: The file. > + * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). > + * @max: Maximum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). > + * > + * The filesystem should call this function in its inode constructor to > + * indicate which sizes of folio the VFS can use to cache the contents > + * of the file. This should only be used if the filesystem needs special > + * handling of folio sizes (ie there is something the core cannot know). > + * Do not tune it based on, eg, i_size. > + * > + * Context: This should not be called while the inode is active as it > + * is non-atomic. > + */ > +static inline void mapping_set_folio_orders(struct address_space *mapping, > + unsigned int min, unsigned int max) > +{ > + if (min == 1) > + min = 2; > + if (max < min) > + max = min; > + if (max > MAX_PAGECACHE_ORDER) > + max = MAX_PAGECACHE_ORDER; > + > + /* > + * XXX: max is ignored as only minimum folio order is supported > + * currently. > + */ > + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | > + (min << AS_FOLIO_ORDER_MIN) | > + (MAX_PAGECACHE_ORDER << AS_FOLIO_ORDER_MAX); > +} > + > /** > * mapping_set_large_folios() - Indicate the file supports large folios. > * @mapping: The file. > @@ -357,7 +409,22 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) > */ > static inline void mapping_set_large_folios(struct address_space *mapping) > { > - __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); > + mapping_set_folio_orders(mapping, 0, MAX_PAGECACHE_ORDER); > +} > + > +static inline unsigned int mapping_max_folio_order(struct address_space *mapping) > +{ > + return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; > +} > + > +static inline unsigned int mapping_min_folio_order(struct address_space *mapping) > +{ > + return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; > +} > + > +static inline unsigned int mapping_min_folio_nrpages(struct address_space *mapping) > +{ > + return 1U << mapping_min_folio_order(mapping); > } > > /* > @@ -367,7 +434,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) > static inline bool mapping_large_folio_support(struct address_space *mapping) > { > return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && > - test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); > + (mapping_max_folio_order(mapping) > 0); > } > > static inline int filemap_nr_thps(struct address_space *mapping) > @@ -528,19 +595,6 @@ static inline void *detach_page_private(struct page *page) > return folio_detach_private(page_folio(page)); > } > > -/* > - * There are some parts of the kernel which assume that PMD entries > - * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, > - * limit the maximum allocation order to PMD size. I'm not aware of any > - * assumptions about maximum order if THP are disabled, but 8 seems like > - * a good order (that's 1MB if you're using 4kB pages) > - */ > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE > -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER > -#else > -#define MAX_PAGECACHE_ORDER 8 > -#endif > - > #ifdef CONFIG_NUMA > struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); > #else > -- > 2.43.0 > >
On Tue, Feb 13, 2024 at 08:34:31AM -0800, Darrick J. Wong wrote: > On Tue, Feb 13, 2024 at 10:37:00AM +0100, Pankaj Raghav (Samsung) wrote: > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org> > > > > Some filesystems want to be able to limit the maximum size of folios, > > and some want to be able to ensure that folios are at least a certain > > size. Add mapping_set_folio_orders() to allow this level of control. > > The max folio order parameter is ignored and it is always set to > > MAX_PAGECACHE_ORDER. > > Why? If MAX_PAGECACHE_ORDER is 8 and I instead pass in max==3, I'm > going to be surprised by my constraint being ignored. Maybe I said that > because I'm not prepared to handle an order-7 folio; or some customer > will have some weird desire to twist this knob to make their workflow > faster. > > --D Maybe I should have been explicit. We are planning to add support for min order in the first round, and we want to add support for max order once the min order support is upstreamed. It was done mainly to reduce the scope and testing of this series. I definitely agree there are usecases for setting the max order. It is also the feedback we got from LPC. So one idea would be not to expose max option until we add the support for max order? So filesystems can only set the min_order with the initial support? > > +static inline void mapping_set_folio_orders(struct address_space *mapping, > > + unsigned int min, unsigned int max) > > +{ > > + if (min == 1) > > + min = 2; > > + if (max < min) > > + max = min; > > + if (max > MAX_PAGECACHE_ORDER) > > + max = MAX_PAGECACHE_ORDER; > > + > > + /* > > + * XXX: max is ignored as only minimum folio order is supported > > + * currently. > > + */ > > + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | > > + (min << AS_FOLIO_ORDER_MIN) | > > + (MAX_PAGECACHE_ORDER << AS_FOLIO_ORDER_MAX); > > +} > > +
On Tue, Feb 13, 2024 at 10:05:54PM +0100, Pankaj Raghav (Samsung) wrote: > On Tue, Feb 13, 2024 at 08:34:31AM -0800, Darrick J. Wong wrote: > > On Tue, Feb 13, 2024 at 10:37:00AM +0100, Pankaj Raghav (Samsung) wrote: > > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org> > > > > > > Some filesystems want to be able to limit the maximum size of folios, > > > and some want to be able to ensure that folios are at least a certain > > > size. Add mapping_set_folio_orders() to allow this level of control. > > > The max folio order parameter is ignored and it is always set to > > > MAX_PAGECACHE_ORDER. > > > > Why? If MAX_PAGECACHE_ORDER is 8 and I instead pass in max==3, I'm > > going to be surprised by my constraint being ignored. Maybe I said that > > because I'm not prepared to handle an order-7 folio; or some customer > > will have some weird desire to twist this knob to make their workflow > > faster. > > > > --D > Maybe I should have been explicit. We are planning to add support > for min order in the first round, and we want to add support for max order > once the min order support is upstreamed. It was done mainly to reduce > the scope and testing of this series. > > I definitely agree there are usecases for setting the max order. It is > also the feedback we got from LPC. > > So one idea would be not to expose max option until we add the support > for max order? So filesystems can only set the min_order with the > initial support? Yeah, there's really no point in having an argument that's deliberately ignored. --D > > > +static inline void mapping_set_folio_orders(struct address_space *mapping, > > > + unsigned int min, unsigned int max) > > > +{ > > > + if (min == 1) > > > + min = 2; > > > + if (max < min) > > > + max = min; > > > + if (max > MAX_PAGECACHE_ORDER) > > > + max = MAX_PAGECACHE_ORDER; > > > + > > > + /* > > > + * XXX: max is ignored as only minimum folio order is supported > > > + * currently. > > > + */ > > > + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | > > > + (min << AS_FOLIO_ORDER_MIN) | > > > + (MAX_PAGECACHE_ORDER << AS_FOLIO_ORDER_MAX); > > > +} > > > + >
On Tue, Feb 13, 2024 at 10:37:00AM +0100, Pankaj Raghav (Samsung) wrote: > +static inline void mapping_set_folio_orders(struct address_space *mapping, > + unsigned int min, unsigned int max) > +{ > + if (min == 1) > + min = 2; If you order the "support order-1 folios" patch first, you can drop these two lines. > +static inline unsigned int mapping_min_folio_nrpages(struct address_space *mapping) I'm not sure if you need this, but it should return unsigned long, not unsigned int. With 64KiB pages on Arm, a PMD page is 512MiB (order 13) and a PUD page will be order 26, which is far too close to 2^32 for my comfort.
On Tue, Feb 13, 2024 at 01:29:14PM -0800, Darrick J. Wong wrote: > On Tue, Feb 13, 2024 at 10:05:54PM +0100, Pankaj Raghav (Samsung) wrote: > > On Tue, Feb 13, 2024 at 08:34:31AM -0800, Darrick J. Wong wrote: > > > On Tue, Feb 13, 2024 at 10:37:00AM +0100, Pankaj Raghav (Samsung) wrote: > > > > From: "Matthew Wilcox (Oracle)" <willy@infradead.org> > > > > > > > > Some filesystems want to be able to limit the maximum size of folios, > > > > and some want to be able to ensure that folios are at least a certain > > > > size. Add mapping_set_folio_orders() to allow this level of control. > > > > The max folio order parameter is ignored and it is always set to > > > > MAX_PAGECACHE_ORDER. > > > > > > Why? If MAX_PAGECACHE_ORDER is 8 and I instead pass in max==3, I'm > > > going to be surprised by my constraint being ignored. Maybe I said that > > > because I'm not prepared to handle an order-7 folio; or some customer > > > will have some weird desire to twist this knob to make their workflow > > > faster. > > > > > > --D > > Maybe I should have been explicit. We are planning to add support > > for min order in the first round, and we want to add support for max order > > once the min order support is upstreamed. It was done mainly to reduce > > the scope and testing of this series. > > > > I definitely agree there are usecases for setting the max order. It is > > also the feedback we got from LPC. > > > > So one idea would be not to expose max option until we add the support > > for max order? So filesystems can only set the min_order with the > > initial support? > > Yeah, there's really no point in having an argument that's deliberately > ignored. I favour introducing the right APIs even if they're not fully implemented. We have no filesystems today that need this, so it doesn't need to be implemented, but if we have to go back and add it, it's more churn for every filesystem. I'm open to better ideas about the API; I think for a lot of filesystems they only want to set the minimum, so maybe introducing that API now would be a good thing.
On Wed, Feb 14, 2024 at 06:49:49PM +0000, Matthew Wilcox wrote: > On Tue, Feb 13, 2024 at 10:37:00AM +0100, Pankaj Raghav (Samsung) wrote: > > +static inline void mapping_set_folio_orders(struct address_space *mapping, > > + unsigned int min, unsigned int max) > > +{ > > + if (min == 1) > > + min = 2; > > If you order the "support order-1 folios" patch first, you can drop > these two lines. > Thanks for pointing this out. I actually forgot to update this later in my series. The only failure I was noticing for LBS in 8k block sizes (generic/630) gets fixed by this change as well :) . > > +static inline unsigned int mapping_min_folio_nrpages(struct address_space *mapping) > > I'm not sure if you need this, but it should return unsigned long, not > unsigned int. With 64KiB pages on Arm, a PMD page is 512MiB (order 13) > and a PUD page will be order 26, which is far too close to 2^32 for > my comfort. There were some suggestions from Chinner which might make this function go away. But in case I need it, I will update it to unsigned long to be on the safe side.
> > > Maybe I should have been explicit. We are planning to add support > > > for min order in the first round, and we want to add support for max order > > > once the min order support is upstreamed. It was done mainly to reduce > > > the scope and testing of this series. > > > > > > I definitely agree there are usecases for setting the max order. It is > > > also the feedback we got from LPC. > > > > > > So one idea would be not to expose max option until we add the support > > > for max order? So filesystems can only set the min_order with the > > > initial support? > > > > Yeah, there's really no point in having an argument that's deliberately > > ignored. > > I favour introducing the right APIs even if they're not fully implemented. > We have no filesystems today that need this, so it doesn't need to > be implemented, but if we have to go back and add it, it's more churn > for every filesystem. I'm open to better ideas about the API; I think > for a lot of filesystems they only want to set the minimum, so maybe > introducing that API now would be a good thing. I will introduce a new API that only exposes the min order for now. I agree with you that I don't see a lot of filesystems other than XFS using this in the near future. We deduce min order based on the filesystem blocksize but we don't have any mechanisms in place from userspace to set the max order for a filesystem. So that also needs to be thought through and discussed with the community. I hope to start working on max_order immediately after upstreaming the min_order feature. -- Pankaj
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2df35e65557d..5618f762187b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -202,13 +202,18 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, - AS_LARGE_FOLIO_SUPPORT = 6, - AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ - AS_STABLE_WRITES, /* must wait for writeback before modifying + AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ - AS_UNMOVABLE, /* The mapping cannot be moved, ever */ + AS_FOLIO_ORDER_MIN = 8, + AS_FOLIO_ORDER_MAX = 13, /* Bit 8-17 are used for FOLIO_ORDER */ + AS_UNMOVABLE = 18, /* The mapping cannot be moved, ever */ }; +#define AS_FOLIO_ORDER_MIN_MASK 0x00001f00 +#define AS_FOLIO_ORDER_MAX_MASK 0x0003e000 +#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) + /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set @@ -344,6 +349,53 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + +/* + * mapping_set_folio_orders() - Set the range of folio sizes supported. + * @mapping: The file. + * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). + * @max: Maximum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). + * + * The filesystem should call this function in its inode constructor to + * indicate which sizes of folio the VFS can use to cache the contents + * of the file. This should only be used if the filesystem needs special + * handling of folio sizes (ie there is something the core cannot know). + * Do not tune it based on, eg, i_size. + * + * Context: This should not be called while the inode is active as it + * is non-atomic. + */ +static inline void mapping_set_folio_orders(struct address_space *mapping, + unsigned int min, unsigned int max) +{ + if (min == 1) + min = 2; + if (max < min) + max = min; + if (max > MAX_PAGECACHE_ORDER) + max = MAX_PAGECACHE_ORDER; + + /* + * XXX: max is ignored as only minimum folio order is supported + * currently. + */ + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | + (min << AS_FOLIO_ORDER_MIN) | + (MAX_PAGECACHE_ORDER << AS_FOLIO_ORDER_MAX); +} + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. @@ -357,7 +409,22 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) */ static inline void mapping_set_large_folios(struct address_space *mapping) { - __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + mapping_set_folio_orders(mapping, 0, MAX_PAGECACHE_ORDER); +} + +static inline unsigned int mapping_max_folio_order(struct address_space *mapping) +{ + return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; +} + +static inline unsigned int mapping_min_folio_order(struct address_space *mapping) +{ + return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; +} + +static inline unsigned int mapping_min_folio_nrpages(struct address_space *mapping) +{ + return 1U << mapping_min_folio_order(mapping); } /* @@ -367,7 +434,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) static inline bool mapping_large_folio_support(struct address_space *mapping) { return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + (mapping_max_folio_order(mapping) > 0); } static inline int filemap_nr_thps(struct address_space *mapping) @@ -528,19 +595,6 @@ static inline void *detach_page_private(struct page *page) return folio_detach_private(page_folio(page)); } -/* - * There are some parts of the kernel which assume that PMD entries - * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, - * limit the maximum allocation order to PMD size. I'm not aware of any - * assumptions about maximum order if THP are disabled, but 8 seems like - * a good order (that's 1MB if you're using 4kB pages) - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER -#else -#define MAX_PAGECACHE_ORDER 8 -#endif - #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); #else