diff mbox series

[v8,01/10] fs: Allow fine-grained control of folio sizes

Message ID 20240625114420.719014-2-kernel@pankajraghav.com (mailing list archive)
State New
Headers show
Series enable bs > ps in XFS | expand

Commit Message

Pankaj Raghav (Samsung) June 25, 2024, 11:44 a.m. UTC
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>

We need filesystems to be able to communicate acceptable folio sizes
to the pagecache for a variety of uses (e.g. large block sizes).
Support a range of folio sizes between order-0 and order-31.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 include/linux/pagemap.h | 86 ++++++++++++++++++++++++++++++++++-------
 mm/filemap.c            |  6 +--
 mm/readahead.c          |  4 +-
 3 files changed, 77 insertions(+), 19 deletions(-)

Comments

Ryan Roberts July 4, 2024, 12:23 p.m. UTC | #1
Hi,

Here are some drive-by review comments as I'm evaluating whether these patches
can help me with what I'm trying to do at
https://lore.kernel.org/linux-mm/20240215154059.2863126-1-ryan.roberts@arm.com/...


On 25/06/2024 12:44, Pankaj Raghav (Samsung) wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> We need filesystems to be able to communicate acceptable folio sizes
> to the pagecache for a variety of uses (e.g. large block sizes).
> Support a range of folio sizes between order-0 and order-31.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  include/linux/pagemap.h | 86 ++++++++++++++++++++++++++++++++++-------
>  mm/filemap.c            |  6 +--
>  mm/readahead.c          |  4 +-
>  3 files changed, 77 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 4b71d581091f..0c51154cdb57 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -204,14 +204,21 @@ enum mapping_flags {
>  	AS_EXITING	= 4, 	/* final truncate in progress */
>  	/* writeback related tags are not used */
>  	AS_NO_WRITEBACK_TAGS = 5,
> -	AS_LARGE_FOLIO_SUPPORT = 6,

nit: this removed enum is still referenced in a comment further down the file.

> -	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
> -	AS_STABLE_WRITES,	/* must wait for writeback before modifying
> +	AS_RELEASE_ALWAYS = 6,	/* Call ->release_folio(), even if no private data */
> +	AS_STABLE_WRITES = 7,	/* must wait for writeback before modifying
>  				   folio contents */
> -	AS_UNMOVABLE,		/* The mapping cannot be moved, ever */
> -	AS_INACCESSIBLE,	/* Do not attempt direct R/W access to the mapping */
> +	AS_UNMOVABLE = 8,	/* The mapping cannot be moved, ever */
> +	AS_INACCESSIBLE = 9,	/* Do not attempt direct R/W access to the mapping */
> +	/* Bits 16-25 are used for FOLIO_ORDER */
> +	AS_FOLIO_ORDER_BITS = 5,
> +	AS_FOLIO_ORDER_MIN = 16,
> +	AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,

nit: These 3 new enums seem a bit odd. It might be clearer if you just reserve
the bits for the fields here? AS_FOLIO_ORDER_BITS isn't actually a flags bit and
the MAX value is currently the start of the max field, not the end.

#define AS_FOLIO_ORDER_BITS 5

enum mapping_flags {
	...
	AS_FOLIO_ORDERS_FIRST = 16,
	AS_FOLIO_ORDERS_LAST = AS_FOLIO_ORDERS_FIRST+(2*AS_FOLIO_ORDER_BITS)-1,
	...
};

#define AS_FOLIO_ORDER_MIN_MASK \
	GENMASK(AS_FOLIO_ORDERS_FIRST + AS_FOLIO_ORDER_BITS - 1, \
		AS_FOLIO_ORDERS_FIRST)
#define AS_FOLIO_ORDER_MAX_MASK \
	GENMASK(AS_FOLIO_ORDERS_LAST, \
		AS_FOLIO_ORDERS_LAST - AS_FOLIO_ORDER_BITS + 1)

>  };
>  
> +#define AS_FOLIO_ORDER_MASK     ((1u << AS_FOLIO_ORDER_BITS) - 1)
> +#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_MASK << AS_FOLIO_ORDER_MIN)
> +#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_MASK << AS_FOLIO_ORDER_MAX)
> +
>  /**
>   * mapping_set_error - record a writeback error in the address_space
>   * @mapping: the mapping in which an error should be set
> @@ -360,9 +367,49 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
>  #define MAX_PAGECACHE_ORDER	8
>  #endif
>  
> +/*
> + * mapping_set_folio_order_range() - Set the orders supported by a file.
> + * @mapping: The address space of the file.
> + * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
> + * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
> + *
> + * The filesystem should call this function in its inode constructor to
> + * indicate which base size (min) and maximum size (max) of folio the VFS
> + * can use to cache the contents of the file.  This should only be used
> + * if the filesystem needs special handling of folio sizes (ie there is
> + * something the core cannot know).
> + * Do not tune it based on, eg, i_size.
> + *
> + * Context: This should not be called while the inode is active as it
> + * is non-atomic.
> + */
> +static inline void mapping_set_folio_order_range(struct address_space *mapping,
> +						 unsigned int min,
> +						 unsigned int max)
> +{
> +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +		return;
> +
> +	if (min > MAX_PAGECACHE_ORDER)
> +		min = MAX_PAGECACHE_ORDER;
> +	if (max > MAX_PAGECACHE_ORDER)
> +		max = MAX_PAGECACHE_ORDER;
> +	if (max < min)
> +		max = min;

It seems strange to silently clamp these? Presumably for the bs>ps usecase,
whatever values are passed in are a hard requirement? So wouldn't want them to
be silently reduced. (Especially given the recent change to reduce the size of
MAX_PAGECACHE_ORDER to less then PMD size in some cases).

> +
> +	mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
> +		(min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX);
> +}
> +
> +static inline void mapping_set_folio_min_order(struct address_space *mapping,
> +					       unsigned int min)
> +{
> +	mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
> +}
> +
>  /**
>   * mapping_set_large_folios() - Indicate the file supports large folios.
> - * @mapping: The file.
> + * @mapping: The address space of the file.
>   *
>   * The filesystem should call this function in its inode constructor to
>   * indicate that the VFS can use large folios to cache the contents of
> @@ -373,7 +420,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
>   */
>  static inline void mapping_set_large_folios(struct address_space *mapping)
>  {
> -	__set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
> +	mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER);
> +}
> +
> +static inline
> +unsigned int mapping_max_folio_order(const struct address_space *mapping)
> +{
> +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +		return 0;
> +	return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX;
> +}
> +
> +static inline
> +unsigned int mapping_min_folio_order(const struct address_space *mapping)
> +{
> +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +		return 0;
> +	return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
>  }
>  
>  /*
> @@ -386,16 +449,13 @@ static inline bool mapping_large_folio_support(struct address_space *mapping)
>  	VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
>  			"Anonymous mapping always supports large folio");
>  
> -	return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
> -		test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
> +	return mapping_max_folio_order(mapping) > 0;
>  }
>  
>  /* Return the maximum folio size for this pagecache mapping, in bytes. */
> -static inline size_t mapping_max_folio_size(struct address_space *mapping)
> +static inline size_t mapping_max_folio_size(const struct address_space *mapping)
>  {
> -	if (mapping_large_folio_support(mapping))
> -		return PAGE_SIZE << MAX_PAGECACHE_ORDER;
> -	return PAGE_SIZE;
> +	return PAGE_SIZE << mapping_max_folio_order(mapping);
>  }
>  
>  static inline int filemap_nr_thps(struct address_space *mapping)
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 0b8c732bb643..d617c9afca51 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1933,10 +1933,8 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
>  		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
>  			fgp_flags |= FGP_LOCK;
>  
> -		if (!mapping_large_folio_support(mapping))
> -			order = 0;
> -		if (order > MAX_PAGECACHE_ORDER)
> -			order = MAX_PAGECACHE_ORDER;
> +		if (order > mapping_max_folio_order(mapping))
> +			order = mapping_max_folio_order(mapping);
>  		/* If we're not aligned, allocate a smaller folio */
>  		if (index & ((1UL << order) - 1))
>  			order = __ffs(index);
> diff --git a/mm/readahead.c b/mm/readahead.c
> index c1b23989d9ca..66058ae02f2e 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -503,9 +503,9 @@ void page_cache_ra_order(struct readahead_control *ractl,
>  
>  	limit = min(limit, index + ra->size - 1);
>  
> -	if (new_order < MAX_PAGECACHE_ORDER) {
> +	if (new_order < mapping_max_folio_order(mapping)) {
>  		new_order += 2;
> -		new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
> +		new_order = min(mapping_max_folio_order(mapping), new_order);
>  		new_order = min_t(unsigned int, new_order, ilog2(ra->size));

I wonder if its possible that ra->size could ever be less than
mapping_min_folio_order()? Do you need to handle that?

Thanks,
Ryan

>  	}
>
Matthew Wilcox July 4, 2024, 3:20 p.m. UTC | #2
On Thu, Jul 04, 2024 at 01:23:20PM +0100, Ryan Roberts wrote:
> > -	AS_LARGE_FOLIO_SUPPORT = 6,
> 
> nit: this removed enum is still referenced in a comment further down the file.

Thanks.  Pankaj, let me know if you want me to send you a patch or if
you'll do it directly.

> > +	/* Bits 16-25 are used for FOLIO_ORDER */
> > +	AS_FOLIO_ORDER_BITS = 5,
> > +	AS_FOLIO_ORDER_MIN = 16,
> > +	AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
> 
> nit: These 3 new enums seem a bit odd.

Yes, this is "too many helpful suggestions" syndrome.  It made a lot
more sense originally.

https://lore.kernel.org/linux-fsdevel/ZlUQcEaP3FDXpCge@dread.disaster.area/

> > +static inline void mapping_set_folio_order_range(struct address_space *mapping,
> > +						 unsigned int min,
> > +						 unsigned int max)
> > +{
> > +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > +		return;
> > +
> > +	if (min > MAX_PAGECACHE_ORDER)
> > +		min = MAX_PAGECACHE_ORDER;
> > +	if (max > MAX_PAGECACHE_ORDER)
> > +		max = MAX_PAGECACHE_ORDER;
> > +	if (max < min)
> > +		max = min;
> 
> It seems strange to silently clamp these? Presumably for the bs>ps usecase,
> whatever values are passed in are a hard requirement? So wouldn't want them to
> be silently reduced. (Especially given the recent change to reduce the size of
> MAX_PAGECACHE_ORDER to less then PMD size in some cases).

Hm, yes.  We should probably make this return an errno.  Including
returning an errno for !IS_ENABLED() and min > 0.

> > -	if (new_order < MAX_PAGECACHE_ORDER) {
> > +	if (new_order < mapping_max_folio_order(mapping)) {
> >  		new_order += 2;
> > -		new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
> > +		new_order = min(mapping_max_folio_order(mapping), new_order);
> >  		new_order = min_t(unsigned int, new_order, ilog2(ra->size));
> 
> I wonder if its possible that ra->size could ever be less than
> mapping_min_folio_order()? Do you need to handle that?

I think we take care of that in later patches?  This patch is mostly
about honouring the max properly and putting in the infrastructure for
the min, but not doing all the necessary work for min.
Ryan Roberts July 4, 2024, 3:52 p.m. UTC | #3
On 04/07/2024 16:20, Matthew Wilcox wrote:
> On Thu, Jul 04, 2024 at 01:23:20PM +0100, Ryan Roberts wrote:
>>> -	AS_LARGE_FOLIO_SUPPORT = 6,
>>
>> nit: this removed enum is still referenced in a comment further down the file.
> 
> Thanks.  Pankaj, let me know if you want me to send you a patch or if
> you'll do it directly.
> 
>>> +	/* Bits 16-25 are used for FOLIO_ORDER */
>>> +	AS_FOLIO_ORDER_BITS = 5,
>>> +	AS_FOLIO_ORDER_MIN = 16,
>>> +	AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
>>
>> nit: These 3 new enums seem a bit odd.
> 
> Yes, this is "too many helpful suggestions" syndrome.  It made a lot
> more sense originally.

Well now you can add my "helpful" suggestion to that list :)

> 
> https://lore.kernel.org/linux-fsdevel/ZlUQcEaP3FDXpCge@dread.disaster.area/
> 
>>> +static inline void mapping_set_folio_order_range(struct address_space *mapping,
>>> +						 unsigned int min,
>>> +						 unsigned int max)
>>> +{
>>> +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
>>> +		return;
>>> +
>>> +	if (min > MAX_PAGECACHE_ORDER)
>>> +		min = MAX_PAGECACHE_ORDER;
>>> +	if (max > MAX_PAGECACHE_ORDER)
>>> +		max = MAX_PAGECACHE_ORDER;
>>> +	if (max < min)
>>> +		max = min;
>>
>> It seems strange to silently clamp these? Presumably for the bs>ps usecase,
>> whatever values are passed in are a hard requirement? So wouldn't want them to
>> be silently reduced. (Especially given the recent change to reduce the size of
>> MAX_PAGECACHE_ORDER to less then PMD size in some cases).
> 
> Hm, yes.  We should probably make this return an errno.  Including
> returning an errno for !IS_ENABLED() and min > 0.

Right.

> 
>>> -	if (new_order < MAX_PAGECACHE_ORDER) {
>>> +	if (new_order < mapping_max_folio_order(mapping)) {
>>>  		new_order += 2;
>>> -		new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
>>> +		new_order = min(mapping_max_folio_order(mapping), new_order);
>>>  		new_order = min_t(unsigned int, new_order, ilog2(ra->size));
>>
>> I wonder if its possible that ra->size could ever be less than
>> mapping_min_folio_order()? Do you need to handle that?
> 
> I think we take care of that in later patches? 

Yes I saw that once I got to it. You can ignore this.

> This patch is mostly
> about honouring the max properly and putting in the infrastructure for
> the min, but not doing all the necessary work for min.
Pankaj Raghav (Samsung) July 4, 2024, 9:28 p.m. UTC | #4
On Thu, Jul 04, 2024 at 04:20:13PM +0100, Matthew Wilcox wrote:
> On Thu, Jul 04, 2024 at 01:23:20PM +0100, Ryan Roberts wrote:
> > > -	AS_LARGE_FOLIO_SUPPORT = 6,
> > 
> > nit: this removed enum is still referenced in a comment further down the file.
Good catch.
> 
> Thanks.  Pankaj, let me know if you want me to send you a patch or if
> you'll do it directly.
Yes, I will fold the changes.
> 
> > > +static inline void mapping_set_folio_order_range(struct address_space *mapping,
> > > +						 unsigned int min,
> > > +						 unsigned int max)
> > > +{
> > > +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > > +		return;
> > > +
> > > +	if (min > MAX_PAGECACHE_ORDER)
> > > +		min = MAX_PAGECACHE_ORDER;
> > > +	if (max > MAX_PAGECACHE_ORDER)
> > > +		max = MAX_PAGECACHE_ORDER;
> > > +	if (max < min)
> > > +		max = min;
> > 
> > It seems strange to silently clamp these? Presumably for the bs>ps usecase,
> > whatever values are passed in are a hard requirement? So wouldn't want them to
> > be silently reduced. (Especially given the recent change to reduce the size of
> > MAX_PAGECACHE_ORDER to less then PMD size in some cases).
> 
> Hm, yes.  We should probably make this return an errno.  Including
> returning an errno for !IS_ENABLED() and min > 0.
> 

Something like this? (I also need to change the xfs_icache.c to
use this return value in the last patch)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 14e1415f7dcf..04916720f807 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -390,28 +390,27 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
  * Context: This should not be called while the inode is active as it
  * is non-atomic.
  */
-static inline void mapping_set_folio_order_range(struct address_space *mapping,
+static inline int mapping_set_folio_order_range(struct address_space *mapping,
                                                 unsigned int min,
                                                 unsigned int max)
 {
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
-               return;
+               return -EINVAL;
 
-       if (min > MAX_PAGECACHE_ORDER)
-               min = MAX_PAGECACHE_ORDER;
-       if (max > MAX_PAGECACHE_ORDER)
-               max = MAX_PAGECACHE_ORDER;
+       if (min > MAX_PAGECACHE_ORDER || max > MAX_PAGECACHE_ORDER)
+               return -EINVAL;
        if (max < min)
                max = min;
 
        mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
                (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX);
+       return 0;
 }
 
-static inline void mapping_set_folio_min_order(struct address_space *mapping,
+static inline int mapping_set_folio_min_order(struct address_space *mapping,
                                               unsigned int min)
 {
-       mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
+       return mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
 }
 
 
@@ -428,6 +427,10 @@ static inline void mapping_set_folio_min_order(struct address_space *mapping,
  */
 static inline void mapping_set_large_folios(struct address_space *mapping)
 {
+       /*
+        * The return value can be safely ignored because this range
+        * will always be supported by the page cache.
+        */
        mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER);
 }

 --
 Pankaj
Pankaj Raghav (Samsung) July 4, 2024, 9:34 p.m. UTC | #5
On Thu, Jul 04, 2024 at 01:23:20PM +0100, Ryan Roberts wrote:
> Hi,
> 
> Here are some drive-by review comments as I'm evaluating whether these patches
> can help me with what I'm trying to do at
> https://lore.kernel.org/linux-mm/20240215154059.2863126-1-ryan.roberts@arm.com/...

Just FYI, I posted the 9th version[0] today before these comments landed
and they do not your proposed changes. 

And it will also be better if you can put your comments in the latest
version just to avoid fragmentation. :)

[0]https://lore.kernel.org/linux-fsdevel/20240704112320.82104-1-kernel@pankajraghav.com/
Dave Chinner July 4, 2024, 10:06 p.m. UTC | #6
On Thu, Jul 04, 2024 at 04:20:13PM +0100, Matthew Wilcox wrote:
> On Thu, Jul 04, 2024 at 01:23:20PM +0100, Ryan Roberts wrote:
> > > -	AS_LARGE_FOLIO_SUPPORT = 6,
> > 
> > nit: this removed enum is still referenced in a comment further down the file.
> 
> Thanks.  Pankaj, let me know if you want me to send you a patch or if
> you'll do it directly.
> 
> > > +	/* Bits 16-25 are used for FOLIO_ORDER */
> > > +	AS_FOLIO_ORDER_BITS = 5,
> > > +	AS_FOLIO_ORDER_MIN = 16,
> > > +	AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
> > 
> > nit: These 3 new enums seem a bit odd.
> 
> Yes, this is "too many helpful suggestions" syndrome.  It made a lot
> more sense originally.
> 
> https://lore.kernel.org/linux-fsdevel/ZlUQcEaP3FDXpCge@dread.disaster.area/
> 
> > > +static inline void mapping_set_folio_order_range(struct address_space *mapping,
> > > +						 unsigned int min,
> > > +						 unsigned int max)
> > > +{
> > > +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > > +		return;
> > > +
> > > +	if (min > MAX_PAGECACHE_ORDER)
> > > +		min = MAX_PAGECACHE_ORDER;
> > > +	if (max > MAX_PAGECACHE_ORDER)
> > > +		max = MAX_PAGECACHE_ORDER;
> > > +	if (max < min)
> > > +		max = min;
> > 
> > It seems strange to silently clamp these? Presumably for the bs>ps usecase,
> > whatever values are passed in are a hard requirement? So wouldn't want them to
> > be silently reduced. (Especially given the recent change to reduce the size of
> > MAX_PAGECACHE_ORDER to less then PMD size in some cases).
> 
> Hm, yes.  We should probably make this return an errno.  Including
> returning an errno for !IS_ENABLED() and min > 0.

What are callers supposed to do with an error? In the case of
setting up a newly allocated inode in XFS, the error would be
returned in the middle of a transaction and so this failure would
result in a filesystem shutdown.

Regardless, the filesystem should never be passing min >
MAX_PAGECACHE_ORDER any time soon for bs > ps configurations. block
sizes go up to 64kB, which is a lot smaller than
MAX_PAGECACHE_ORDER.  IOWs, seeing min > MAX_PAGECACHE_ORDER is
indicative of a severe bug, should be considered a fatal developer
mistake and the kernel terminated immediately. Such mistakes should
-never, ever- happen on productions systems. IOWs, this is a
situation where we should assert or bug and kill the kernel
immediately, or at minimum warn-on-once() and truncate the value and
hope things don't get immediately worse.

If we kill the kernel because min is out of range, the system will
fail on the first inode instantiation on that filesystem.
Filesystem developers should notice that sort of failure pretty
quickly and realise they've done something that isn't currently
supported...

-Dave.
Matthew Wilcox July 4, 2024, 11:56 p.m. UTC | #7
On Fri, Jul 05, 2024 at 08:06:51AM +1000, Dave Chinner wrote:
> > > It seems strange to silently clamp these? Presumably for the bs>ps usecase,
> > > whatever values are passed in are a hard requirement? So wouldn't want them to
> > > be silently reduced. (Especially given the recent change to reduce the size of
> > > MAX_PAGECACHE_ORDER to less then PMD size in some cases).
> > 
> > Hm, yes.  We should probably make this return an errno.  Including
> > returning an errno for !IS_ENABLED() and min > 0.
> 
> What are callers supposed to do with an error? In the case of
> setting up a newly allocated inode in XFS, the error would be
> returned in the middle of a transaction and so this failure would
> result in a filesystem shutdown.

I suggest you handle it better than this.  If the device is asking for a
blocksize > PMD_SIZE, you should fail to mount it.  If the device is
asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
not set, you should also decline to mount the filesystem.
Dave Chinner July 5, 2024, 4:32 a.m. UTC | #8
On Fri, Jul 05, 2024 at 12:56:28AM +0100, Matthew Wilcox wrote:
> On Fri, Jul 05, 2024 at 08:06:51AM +1000, Dave Chinner wrote:
> > > > It seems strange to silently clamp these? Presumably for the bs>ps usecase,
> > > > whatever values are passed in are a hard requirement? So wouldn't want them to
> > > > be silently reduced. (Especially given the recent change to reduce the size of
> > > > MAX_PAGECACHE_ORDER to less then PMD size in some cases).
> > > 
> > > Hm, yes.  We should probably make this return an errno.  Including
> > > returning an errno for !IS_ENABLED() and min > 0.
> > 
> > What are callers supposed to do with an error? In the case of
> > setting up a newly allocated inode in XFS, the error would be
> > returned in the middle of a transaction and so this failure would
> > result in a filesystem shutdown.
> 
> I suggest you handle it better than this.  If the device is asking for a
> blocksize > PMD_SIZE, you should fail to mount it.

That's my point: we already do that.

The largest block size we support is 64kB and that's way smaller
than PMD_SIZE on all platforms and we always check for bs > ps 
support at mount time when the filesystem bs > ps.

Hence we're never going to set the min value to anything unsupported
unless someone makes a massive programming mistake. At which point,
we want a *hard, immediate fail* so the developer notices their
mistake immediately. All filesystems and block devices need to
behave this way so the limits should be encoded as asserts in the
function to trigger such behaviour.

> If the device is
> asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
> not set, you should also decline to mount the filesystem.

What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
being able to use large folios?

If that's an actual dependency of using large folios, then we're at
the point where the mm side of large folios needs to be divorced
from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
block layer and also every filesystem that wants to support
sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
support needs to *always* be enabled on systems that say
CONFIG_BLOCK=y.

I'd much prefer the former occurs, because making the block layer
and filesystems dependent on an mm feature they don't actually use
is kinda weird...

-Dave.
Ryan Roberts July 5, 2024, 9:03 a.m. UTC | #9
On 05/07/2024 05:32, Dave Chinner wrote:
> On Fri, Jul 05, 2024 at 12:56:28AM +0100, Matthew Wilcox wrote:
>> On Fri, Jul 05, 2024 at 08:06:51AM +1000, Dave Chinner wrote:
>>>>> It seems strange to silently clamp these? Presumably for the bs>ps usecase,
>>>>> whatever values are passed in are a hard requirement? So wouldn't want them to
>>>>> be silently reduced. (Especially given the recent change to reduce the size of
>>>>> MAX_PAGECACHE_ORDER to less then PMD size in some cases).
>>>>
>>>> Hm, yes.  We should probably make this return an errno.  Including
>>>> returning an errno for !IS_ENABLED() and min > 0.
>>>
>>> What are callers supposed to do with an error? In the case of
>>> setting up a newly allocated inode in XFS, the error would be
>>> returned in the middle of a transaction and so this failure would
>>> result in a filesystem shutdown.
>>
>> I suggest you handle it better than this.  If the device is asking for a
>> blocksize > PMD_SIZE, you should fail to mount it.

A detail, but MAX_PAGECACHE_ORDER may be smaller than PMD_SIZE even on systems
with CONFIG_TRANSPARENT_HUGEPAGE as of a fix that is currently in mm-unstable:

	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	#define PREFERRED_MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
	#else
	#define PREFERRED_MAX_PAGECACHE_ORDER	8
	#endif

	/*
	 * xas_split_alloc() does not support arbitrary orders. This implies no
	 * 512MB THP on ARM64 with 64KB base page size.
	 */
	#define MAX_XAS_ORDER		(XA_CHUNK_SHIFT * 2 - 1)
	#define MAX_PAGECACHE_ORDER	min(MAX_XAS_ORDER,
					    PREFERRED_MAX_PAGECACHE_ORDER)

But that also implies that the page cache can handle up to order-8 without
CONFIG_TRANSPARENT_HUGEPAGE so sounds like there isn't a dependcy on
CONFIG_TRANSPARENT_HUGEPAGE in this respect?



> 
> That's my point: we already do that.
> 
> The largest block size we support is 64kB and that's way smaller
> than PMD_SIZE on all platforms and we always check for bs > ps 
> support at mount time when the filesystem bs > ps.
> 
> Hence we're never going to set the min value to anything unsupported
> unless someone makes a massive programming mistake. At which point,
> we want a *hard, immediate fail* so the developer notices their
> mistake immediately. All filesystems and block devices need to
> behave this way so the limits should be encoded as asserts in the
> function to trigger such behaviour.
> 
>> If the device is
>> asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
>> not set, you should also decline to mount the filesystem.
> 
> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
> being able to use large folios?
> 
> If that's an actual dependency of using large folios, then we're at
> the point where the mm side of large folios needs to be divorced
> from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
> Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
> block layer and also every filesystem that wants to support
> sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
> support needs to *always* be enabled on systems that say
> CONFIG_BLOCK=y.
> 
> I'd much prefer the former occurs, because making the block layer
> and filesystems dependent on an mm feature they don't actually use
> is kinda weird...
> 
> -Dave.
>
Pankaj Raghav (Samsung) July 5, 2024, 12:45 p.m. UTC | #10
On Fri, Jul 05, 2024 at 10:03:31AM +0100, Ryan Roberts wrote:
> On 05/07/2024 05:32, Dave Chinner wrote:
> > On Fri, Jul 05, 2024 at 12:56:28AM +0100, Matthew Wilcox wrote:
> >> On Fri, Jul 05, 2024 at 08:06:51AM +1000, Dave Chinner wrote:
> >>>>> It seems strange to silently clamp these? Presumably for the bs>ps usecase,
> >>>>> whatever values are passed in are a hard requirement? So wouldn't want them to
> >>>>> be silently reduced. (Especially given the recent change to reduce the size of
> >>>>> MAX_PAGECACHE_ORDER to less then PMD size in some cases).
> >>>>
> >>>> Hm, yes.  We should probably make this return an errno.  Including
> >>>> returning an errno for !IS_ENABLED() and min > 0.
> >>>
> >>> What are callers supposed to do with an error? In the case of
> >>> setting up a newly allocated inode in XFS, the error would be
> >>> returned in the middle of a transaction and so this failure would
> >>> result in a filesystem shutdown.
> >>
> >> I suggest you handle it better than this.  If the device is asking for a
> >> blocksize > PMD_SIZE, you should fail to mount it.
> 
> A detail, but MAX_PAGECACHE_ORDER may be smaller than PMD_SIZE even on systems
> with CONFIG_TRANSPARENT_HUGEPAGE as of a fix that is currently in mm-unstable:

> 
> 	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> 	#define PREFERRED_MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
> 	#else
> 	#define PREFERRED_MAX_PAGECACHE_ORDER	8
> 	#endif
> 
> 	/*
> 	 * xas_split_alloc() does not support arbitrary orders. This implies no
> 	 * 512MB THP on ARM64 with 64KB base page size.
> 	 */
> 	#define MAX_XAS_ORDER		(XA_CHUNK_SHIFT * 2 - 1)
> 	#define MAX_PAGECACHE_ORDER	min(MAX_XAS_ORDER,
> 					    PREFERRED_MAX_PAGECACHE_ORDER)
> 
> But that also implies that the page cache can handle up to order-8 without
> CONFIG_TRANSPARENT_HUGEPAGE so sounds like there isn't a dependcy on
> CONFIG_TRANSPARENT_HUGEPAGE in this respect?
> 

I remember seeing willy's comment about dependency on CONFIG_TRANSPARENT_HUGEPAGE 
for large folios[0]:

Some parts of the VM still depend on THP to handle large folios
correctly.  Until those are fixed, prevent creating large folios
if THP are disabled.

This was Jan of 2022. I don't know the status of it now but we enable
large folios for filesystem only when THP is enabled(as you can see in
the helpers mapping_set_folio_order_range()).

[0] https://lore.kernel.org/lkml/20220116121822.1727633-8-willy@infradead.org/
Pankaj Raghav (Samsung) July 5, 2024, 1:24 p.m. UTC | #11
> > I suggest you handle it better than this.  If the device is asking for a
> > blocksize > PMD_SIZE, you should fail to mount it.
> 
> That's my point: we already do that.
> 
> The largest block size we support is 64kB and that's way smaller
> than PMD_SIZE on all platforms and we always check for bs > ps 
> support at mount time when the filesystem bs > ps.
> 
> Hence we're never going to set the min value to anything unsupported
> unless someone makes a massive programming mistake. At which point,
> we want a *hard, immediate fail* so the developer notices their
> mistake immediately. All filesystems and block devices need to
> behave this way so the limits should be encoded as asserts in the
> function to trigger such behaviour.

I agree, this kind of bug will be encountered only during developement 
and not during actual production due to the limit we have fs block size
in XFS.

> 
> > If the device is
> > asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
> > not set, you should also decline to mount the filesystem.
> 
> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
> being able to use large folios?
> 
> If that's an actual dependency of using large folios, then we're at
> the point where the mm side of large folios needs to be divorced
> from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
> Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
> block layer and also every filesystem that wants to support
> sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
> support needs to *always* be enabled on systems that say
> CONFIG_BLOCK=y.

Why CONFIG_BLOCK? I think it is enough if it comes from the FS side
right? And for now, the only FS that needs that sort of bs > ps 
guarantee is XFS with this series. Other filesystems such as bcachefs 
that call mapping_set_large_folios() only enable it as an optimization
and it is not needed for the filesystem to function.

So this is my conclusion from the conversation:
- Add a dependency in Kconfig on THP for XFS until we fix the dependency
  of large folios on THP
- Add a BUILD_BUG_ON(XFS_MAX_BLOCKSIZE > MAX_PAGECACHE_ORDER)
- Add a WARN_ON_ONCE() and clamp the min and max value in
  mapping_set_folio_order_range() ?

Let me know what you all think @willy, @dave and @ryan.

--
Pankaj
Ryan Roberts July 5, 2024, 1:31 p.m. UTC | #12
On 05/07/2024 14:24, Pankaj Raghav (Samsung) wrote:
>>> I suggest you handle it better than this.  If the device is asking for a
>>> blocksize > PMD_SIZE, you should fail to mount it.
>>
>> That's my point: we already do that.
>>
>> The largest block size we support is 64kB and that's way smaller
>> than PMD_SIZE on all platforms and we always check for bs > ps 
>> support at mount time when the filesystem bs > ps.
>>
>> Hence we're never going to set the min value to anything unsupported
>> unless someone makes a massive programming mistake. At which point,
>> we want a *hard, immediate fail* so the developer notices their
>> mistake immediately. All filesystems and block devices need to
>> behave this way so the limits should be encoded as asserts in the
>> function to trigger such behaviour.
> 
> I agree, this kind of bug will be encountered only during developement 
> and not during actual production due to the limit we have fs block size
> in XFS.
> 
>>
>>> If the device is
>>> asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
>>> not set, you should also decline to mount the filesystem.
>>
>> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
>> being able to use large folios?
>>
>> If that's an actual dependency of using large folios, then we're at
>> the point where the mm side of large folios needs to be divorced
>> from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
>> Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
>> block layer and also every filesystem that wants to support
>> sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
>> support needs to *always* be enabled on systems that say
>> CONFIG_BLOCK=y.
> 
> Why CONFIG_BLOCK? I think it is enough if it comes from the FS side
> right? And for now, the only FS that needs that sort of bs > ps 
> guarantee is XFS with this series. Other filesystems such as bcachefs 
> that call mapping_set_large_folios() only enable it as an optimization
> and it is not needed for the filesystem to function.
> 
> So this is my conclusion from the conversation:
> - Add a dependency in Kconfig on THP for XFS until we fix the dependency
>   of large folios on THP

THP isn't supported on some arches, so isn't this effectively saying XFS can no
longer be used with those arches, even if the bs <= ps? I think while pagecache
large folios depend on THP, you need to make this a mount-time check in the FS?

But ideally, MAX_PAGECACHE_ORDER would be set to 0 for
!CONFIG_TRANSPARENT_HUGEPAGE so you can just check against that and don't have
to worry about THP availability directly.

Willy; Why is MAX_PAGECACHE_ORDER set to 8 when THP is disabled currently?

> - Add a BUILD_BUG_ON(XFS_MAX_BLOCKSIZE > MAX_PAGECACHE_ORDER)
> - Add a WARN_ON_ONCE() and clamp the min and max value in
>   mapping_set_folio_order_range() ?
> 
> Let me know what you all think @willy, @dave and @ryan.
> 
> --
> Pankaj
Pankaj Raghav (Samsung) July 5, 2024, 2:14 p.m. UTC | #13
> >>
> >>> If the device is
> >>> asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
> >>> not set, you should also decline to mount the filesystem.
> >>
> >> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
> >> being able to use large folios?
> >>
> >> If that's an actual dependency of using large folios, then we're at
> >> the point where the mm side of large folios needs to be divorced
> >> from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
> >> Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
> >> block layer and also every filesystem that wants to support
> >> sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
> >> support needs to *always* be enabled on systems that say
> >> CONFIG_BLOCK=y.
> > 
> > Why CONFIG_BLOCK? I think it is enough if it comes from the FS side
> > right? And for now, the only FS that needs that sort of bs > ps 
> > guarantee is XFS with this series. Other filesystems such as bcachefs 
> > that call mapping_set_large_folios() only enable it as an optimization
> > and it is not needed for the filesystem to function.
> > 
> > So this is my conclusion from the conversation:
> > - Add a dependency in Kconfig on THP for XFS until we fix the dependency
> >   of large folios on THP
> 
> THP isn't supported on some arches, so isn't this effectively saying XFS can no
> longer be used with those arches, even if the bs <= ps? I think while pagecache
> large folios depend on THP, you need to make this a mount-time check in the FS?
> 
> But ideally, MAX_PAGECACHE_ORDER would be set to 0 for
> !CONFIG_TRANSPARENT_HUGEPAGE so you can just check against that and don't have
> to worry about THP availability directly.

Yes, that would be better. We should have a way to probe it during mount
time without requiring any address_space mapping. We could have a helper
something as follows:

static inline unsigned int mapping_max_folio_order_supported()
{
    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
      return 0;
    return MAX_PAGECACHE_ORDER;
}

This could be used by the FS to verify during mount time.

> 
> Willy; Why is MAX_PAGECACHE_ORDER set to 8 when THP is disabled currently?
> 

This appeared in this patch with the following comment:
https://lore.kernel.org/linux-fsdevel/20230710130253.3484695-8-willy@infradead.org/
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */ 

> > - Add a BUILD_BUG_ON(XFS_MAX_BLOCKSIZE > MAX_PAGECACHE_ORDER)
> > - Add a WARN_ON_ONCE() and clamp the min and max value in
> >   mapping_set_folio_order_range() ?
> > 
> > Let me know what you all think @willy, @dave and @ryan.
> > 
> > --
> > Pankaj
>
Matthew Wilcox July 5, 2024, 3:14 p.m. UTC | #14
On Fri, Jul 05, 2024 at 02:32:37PM +1000, Dave Chinner wrote:
> > If the device is
> > asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
> > not set, you should also decline to mount the filesystem.
> 
> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
> being able to use large folios?

You can't have large folios without CONFIG_TRANSPARENT_HUGEPAGE.
There's a bunch of infrastructure that's gated behind that.  I used to say
"somebody should fix that up", but honestly the amount of work I'm willing
to do to support hobbyist architectures has decreased dramatically.
Any useful system has THP support.
Dave Chinner July 8, 2024, 11:01 p.m. UTC | #15
On Fri, Jul 05, 2024 at 02:31:08PM +0100, Ryan Roberts wrote:
> On 05/07/2024 14:24, Pankaj Raghav (Samsung) wrote:
> >>> I suggest you handle it better than this.  If the device is asking for a
> >>> blocksize > PMD_SIZE, you should fail to mount it.
> >>
> >> That's my point: we already do that.
> >>
> >> The largest block size we support is 64kB and that's way smaller
> >> than PMD_SIZE on all platforms and we always check for bs > ps 
> >> support at mount time when the filesystem bs > ps.
> >>
> >> Hence we're never going to set the min value to anything unsupported
> >> unless someone makes a massive programming mistake. At which point,
> >> we want a *hard, immediate fail* so the developer notices their
> >> mistake immediately. All filesystems and block devices need to
> >> behave this way so the limits should be encoded as asserts in the
> >> function to trigger such behaviour.
> > 
> > I agree, this kind of bug will be encountered only during developement 
> > and not during actual production due to the limit we have fs block size
> > in XFS.
> > 
> >>
> >>> If the device is
> >>> asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
> >>> not set, you should also decline to mount the filesystem.
> >>
> >> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
> >> being able to use large folios?
> >>
> >> If that's an actual dependency of using large folios, then we're at
> >> the point where the mm side of large folios needs to be divorced
> >> from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
> >> Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
> >> block layer and also every filesystem that wants to support
> >> sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
> >> support needs to *always* be enabled on systems that say
> >> CONFIG_BLOCK=y.
> > 
> > Why CONFIG_BLOCK? I think it is enough if it comes from the FS side
> > right? And for now, the only FS that needs that sort of bs > ps 
> > guarantee is XFS with this series. Other filesystems such as bcachefs 
> > that call mapping_set_large_folios() only enable it as an optimization
> > and it is not needed for the filesystem to function.
> > 
> > So this is my conclusion from the conversation:
> > - Add a dependency in Kconfig on THP for XFS until we fix the dependency
> >   of large folios on THP
> 
> THP isn't supported on some arches, so isn't this effectively saying XFS can no
> longer be used with those arches, even if the bs <= ps?

I'm good with that - we're already long past the point where we try
to support XFS on every linux platform. Indeed, we've recent been
musing about making XFS depend on 64 bit only - 32 bit systems don't
have the memory capacity to run the full xfs tool chain (e.g.
xfs_repair) on filesystems over about a TB in size, and they are
greatly limited in kernel memory and vmap areas, both of which XFS
makes heavy use of. Basically, friends don't let friends use XFS on
32 bit systems, and that's been true for about 20 years now.

Our problem is the test matrix - if we now have to explicitly test
XFS both with and without large folios enabled to support these
platforms, we've just doubled our test matrix. The test matrix is
already far too large to robustly cover, so anything that requires
doubling the number of kernel configs we have to test is, IMO, a
non-starter.

That's why we really don't support XFS on 32 bit systems anymore and
why we're talking about making that official with a config option.
If we're at the point where XFS will now depend on large folios (i.e
THP), then we need to seriously consider reducing the supported
arches to just those that support both 64 bit and THP. If niche
arches want to support THP, or enable large folios without the need
for THP, then they can do that work and then they get XFS for
free.

Just because an arch might run a Linux kernel, it doesn't mean we
have to support XFS on it....

-Dave.
Ryan Roberts July 9, 2024, 8:11 a.m. UTC | #16
On 09/07/2024 00:01, Dave Chinner wrote:
> On Fri, Jul 05, 2024 at 02:31:08PM +0100, Ryan Roberts wrote:
>> On 05/07/2024 14:24, Pankaj Raghav (Samsung) wrote:
>>>>> I suggest you handle it better than this.  If the device is asking for a
>>>>> blocksize > PMD_SIZE, you should fail to mount it.
>>>>
>>>> That's my point: we already do that.
>>>>
>>>> The largest block size we support is 64kB and that's way smaller
>>>> than PMD_SIZE on all platforms and we always check for bs > ps 
>>>> support at mount time when the filesystem bs > ps.
>>>>
>>>> Hence we're never going to set the min value to anything unsupported
>>>> unless someone makes a massive programming mistake. At which point,
>>>> we want a *hard, immediate fail* so the developer notices their
>>>> mistake immediately. All filesystems and block devices need to
>>>> behave this way so the limits should be encoded as asserts in the
>>>> function to trigger such behaviour.
>>>
>>> I agree, this kind of bug will be encountered only during developement 
>>> and not during actual production due to the limit we have fs block size
>>> in XFS.
>>>
>>>>
>>>>> If the device is
>>>>> asking for a blocksize > PAGE_SIZE and CONFIG_TRANSPARENT_HUGEPAGE is
>>>>> not set, you should also decline to mount the filesystem.
>>>>
>>>> What does CONFIG_TRANSPARENT_HUGEPAGE have to do with filesystems
>>>> being able to use large folios?
>>>>
>>>> If that's an actual dependency of using large folios, then we're at
>>>> the point where the mm side of large folios needs to be divorced
>>>> from CONFIG_TRANSPARENT_HUGEPAGE and always supported.
>>>> Alternatively, CONFIG_TRANSPARENT_HUGEPAGE needs to selected by the
>>>> block layer and also every filesystem that wants to support
>>>> sector/blocks sizes larger than PAGE_SIZE.  IOWs, large folio
>>>> support needs to *always* be enabled on systems that say
>>>> CONFIG_BLOCK=y.
>>>
>>> Why CONFIG_BLOCK? I think it is enough if it comes from the FS side
>>> right? And for now, the only FS that needs that sort of bs > ps 
>>> guarantee is XFS with this series. Other filesystems such as bcachefs 
>>> that call mapping_set_large_folios() only enable it as an optimization
>>> and it is not needed for the filesystem to function.
>>>
>>> So this is my conclusion from the conversation:
>>> - Add a dependency in Kconfig on THP for XFS until we fix the dependency
>>>   of large folios on THP
>>
>> THP isn't supported on some arches, so isn't this effectively saying XFS can no
>> longer be used with those arches, even if the bs <= ps?
> 
> I'm good with that - we're already long past the point where we try
> to support XFS on every linux platform. Indeed, we've recent been
> musing about making XFS depend on 64 bit only - 32 bit systems don't
> have the memory capacity to run the full xfs tool chain (e.g.
> xfs_repair) on filesystems over about a TB in size, and they are
> greatly limited in kernel memory and vmap areas, both of which XFS
> makes heavy use of. Basically, friends don't let friends use XFS on
> 32 bit systems, and that's been true for about 20 years now.
> 
> Our problem is the test matrix - if we now have to explicitly test
> XFS both with and without large folios enabled to support these
> platforms, we've just doubled our test matrix. The test matrix is
> already far too large to robustly cover, so anything that requires
> doubling the number of kernel configs we have to test is, IMO, a
> non-starter.
> 
> That's why we really don't support XFS on 32 bit systems anymore and
> why we're talking about making that official with a config option.
> If we're at the point where XFS will now depend on large folios (i.e
> THP), then we need to seriously consider reducing the supported
> arches to just those that support both 64 bit and THP. If niche
> arches want to support THP, or enable large folios without the need
> for THP, then they can do that work and then they get XFS for
> free.
> 
> Just because an arch might run a Linux kernel, it doesn't mean we
> have to support XFS on it....

OK. I was just pointing out the impact of adding this Kconfig dependency. If
that impact is explicitly considered and desired, then great. I'll leave you to it.

Thanks,
Ryan

> 
> -Dave.
Pankaj Raghav (Samsung) July 9, 2024, 1:08 p.m. UTC | #17
> > > 
> > > Why CONFIG_BLOCK? I think it is enough if it comes from the FS side
> > > right? And for now, the only FS that needs that sort of bs > ps 
> > > guarantee is XFS with this series. Other filesystems such as bcachefs 
> > > that call mapping_set_large_folios() only enable it as an optimization
> > > and it is not needed for the filesystem to function.
> > > 
> > > So this is my conclusion from the conversation:
> > > - Add a dependency in Kconfig on THP for XFS until we fix the dependency
> > >   of large folios on THP
> > 
> > THP isn't supported on some arches, so isn't this effectively saying XFS can no
> > longer be used with those arches, even if the bs <= ps?
> 
> I'm good with that - we're already long past the point where we try

From my cursory review, I can see that the following arch supports THP
(* indicates it has some dependency on other Kconfig parameter):

arc*, arm*, arm64, loongarch, mips*, powerpc*, riscv*, s390, sparc, x86.

and the following do not have THP support:

alpha, csky, hexagon, m68k, microblaze, nios2, openrisc, parisc, sh, um,
xtensa.

Looks like the arch that do not THP support are either old or embedded
processor that target mainly 32-bit architecture.

So are we OK with?

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index d41edd30388b7..be2c1c0e9fe8b 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -5,6 +5,7 @@ config XFS_FS
        select EXPORTFS
        select LIBCRC32C
        select FS_IOMAP
+       select TRANSPARENT_HUGEPAGE
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can

> to support XFS on every linux platform. Indeed, we've recent been
> musing about making XFS depend on 64 bit only - 32 bit systems don't
> have the memory capacity to run the full xfs tool chain (e.g.
> xfs_repair) on filesystems over about a TB in size, and they are
> greatly limited in kernel memory and vmap areas, both of which XFS
> makes heavy use of. Basically, friends don't let friends use XFS on
> 32 bit systems, and that's been true for about 20 years now.
> 
> Our problem is the test matrix - if we now have to explicitly test
> XFS both with and without large folios enabled to support these
> platforms, we've just doubled our test matrix. The test matrix is
> already far too large to robustly cover, so anything that requires
> doubling the number of kernel configs we have to test is, IMO, a
> non-starter.
> 
> That's why we really don't support XFS on 32 bit systems anymore and
> why we're talking about making that official with a config option.
> If we're at the point where XFS will now depend on large folios (i.e
> THP), then we need to seriously consider reducing the supported
> arches to just those that support both 64 bit and THP. If niche
> arches want to support THP, or enable large folios without the need
> for THP, then they can do that work and then they get XFS for
> free.
> 
> Just because an arch might run a Linux kernel, it doesn't mean we
> have to support XFS on it....

The other option is we can expose a simple helper from page cache as
follows:

static inline unsigned int mapping_max_folio_order_supported()
{
    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
      return 0;
    return MAX_PAGECACHE_ORDER;
}

This could be used to know the maximum order supported at mount time and
deny mounting for LBS configs if max folio supported is less than the
block size being requested.
Pankaj Raghav (Samsung) July 9, 2024, 4:29 p.m. UTC | #18
For now, this is the only patch that is blocking for the next version.

Based on the discussion, is the following logical @ryan, @dave and
@willy?

- We give explicit VM_WARN_ONCE if we try to set folio order range if
  the THP is disabled, min and max is greater than MAX_PAGECACHE_ORDER.

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 14e1415f7dcf4..313c9fad61859 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -394,13 +394,24 @@ static inline void mapping_set_folio_order_range(struct address_space *mapping,
                                                 unsigned int min,
                                                 unsigned int max)
 {
-       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+               VM_WARN_ONCE(1, 
+       "THP needs to be enabled to support mapping folio order range");
                return;
+       }
 
-       if (min > MAX_PAGECACHE_ORDER)
+       if (min > MAX_PAGECACHE_ORDER) {
+               VM_WARN_ONCE(1, 
+       "min order > MAX_PAGECACHE_ORDER. Setting min_order to MAX_PAGECACHE_ORDER");
                min = MAX_PAGECACHE_ORDER;
-       if (max > MAX_PAGECACHE_ORDER)
+       }
+
+       if (max > MAX_PAGECACHE_ORDER) {
+               VM_WARN_ONCE(1, 
+       "max order > MAX_PAGECACHE_ORDER. Setting max_order to MAX_PAGECACHE_ORDER");
                max = MAX_PAGECACHE_ORDER;
+       }
+
        if (max < min)
                max = min;

- We make THP an explicit dependency for XFS:

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index d41edd30388b7..be2c1c0e9fe8b 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -5,6 +5,7 @@ config XFS_FS
        select EXPORTFS
        select LIBCRC32C
        select FS_IOMAP
+       select TRANSPARENT_HUGEPAGE
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can

OR

We create a helper in page cache that FSs can use to check if a specific
order can be supported at mount time:

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 14e1415f7dcf..9be775ef11a5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -374,6 +374,14 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define MAX_XAS_ORDER          (XA_CHUNK_SHIFT * 2 - 1)
 #define MAX_PAGECACHE_ORDER    min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
 
+
+static inline unsigned int mapping_max_folio_order_supported()
+{
+    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+      return 0;
+    return MAX_PAGECACHE_ORDER;
+}
+


diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b8a93a8f35cac..e2be8743c2c20 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1647,6 +1647,15 @@ xfs_fs_fill_super(
                        goto out_free_sb;
                }
 
+               if (mp->m_sb.sb_blocklog - PAGE_SHIFT >
+                   mapping_max_folio_order_supported()) {
+                       xfs_warn(mp,
+"Block Size (%d bytes) is not supported. Check MAX_PAGECACHE_ORDER",
+                       mp->m_sb.sb_blocksize);
+                       error = -ENOSYS;
+                       goto out_free_sb;
+               }
+
                xfs_warn(mp,
 "EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
                        mp->m_sb.sb_blocksize);


--
Pankaj
Matthew Wilcox July 9, 2024, 4:38 p.m. UTC | #19
On Tue, Jul 09, 2024 at 04:29:07PM +0000, Pankaj Raghav (Samsung) wrote:
> +++ b/include/linux/pagemap.h
> @@ -394,13 +394,24 @@ static inline void mapping_set_folio_order_range(struct address_space *mapping,
>                                                  unsigned int min,
>                                                  unsigned int max)
>  {
> -       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> +               VM_WARN_ONCE(1, 
> +       "THP needs to be enabled to support mapping folio order range");
>                 return;
> +       }

No.  Filesystems call mapping_set_folio_order_range() without it being
conditional on CONFIG_TRANSPARENT_HUGEPAGE.  Usually that takes the
form of an unconditional call to mapping_set_large_folios().
Darrick J. Wong July 9, 2024, 4:50 p.m. UTC | #20
On Tue, Jul 09, 2024 at 04:29:07PM +0000, Pankaj Raghav (Samsung) wrote:
> For now, this is the only patch that is blocking for the next version.
> 
> Based on the discussion, is the following logical @ryan, @dave and
> @willy?
> 
> - We give explicit VM_WARN_ONCE if we try to set folio order range if
>   the THP is disabled, min and max is greater than MAX_PAGECACHE_ORDER.
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 14e1415f7dcf4..313c9fad61859 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -394,13 +394,24 @@ static inline void mapping_set_folio_order_range(struct address_space *mapping,
>                                                  unsigned int min,
>                                                  unsigned int max)
>  {
> -       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> +               VM_WARN_ONCE(1, 
> +       "THP needs to be enabled to support mapping folio order range");
>                 return;
> +       }
>  
> -       if (min > MAX_PAGECACHE_ORDER)
> +       if (min > MAX_PAGECACHE_ORDER) {
> +               VM_WARN_ONCE(1, 
> +       "min order > MAX_PAGECACHE_ORDER. Setting min_order to MAX_PAGECACHE_ORDER");
>                 min = MAX_PAGECACHE_ORDER;
> -       if (max > MAX_PAGECACHE_ORDER)
> +       }
> +
> +       if (max > MAX_PAGECACHE_ORDER) {
> +               VM_WARN_ONCE(1, 
> +       "max order > MAX_PAGECACHE_ORDER. Setting max_order to MAX_PAGECACHE_ORDER");
>                 max = MAX_PAGECACHE_ORDER;
> +       }
> +
>         if (max < min)
>                 max = min;
> 
> - We make THP an explicit dependency for XFS:
> 
> diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
> index d41edd30388b7..be2c1c0e9fe8b 100644
> --- a/fs/xfs/Kconfig
> +++ b/fs/xfs/Kconfig
> @@ -5,6 +5,7 @@ config XFS_FS
>         select EXPORTFS
>         select LIBCRC32C
>         select FS_IOMAP
> +       select TRANSPARENT_HUGEPAGE
>         help
>           XFS is a high performance journaling filesystem which originated
>           on the SGI IRIX platform.  It is completely multi-threaded, can
> 
> OR
> 
> We create a helper in page cache that FSs can use to check if a specific
> order can be supported at mount time:

I like this solution better; if XFS is going to drop support for o[ld]d
architectures I think we need /some/ sort of notice period.  Or at least
a better story than "we want to support 64k fsblocks on x64 so we're
withdrawing support even for 4k fsblocks and smallish filesystems on
m68k".

You probably don't want bs>ps support to block on some arcane discussion
about 32-bit, right? ;)

> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 14e1415f7dcf..9be775ef11a5 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -374,6 +374,14 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
>  #define MAX_XAS_ORDER          (XA_CHUNK_SHIFT * 2 - 1)
>  #define MAX_PAGECACHE_ORDER    min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
>  
> +
> +static inline unsigned int mapping_max_folio_order_supported()
> +{
> +    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +      return 0;

Shouldn't this line be indented by two tabs, not six spaces?

> +    return MAX_PAGECACHE_ORDER;
> +}

Alternately, should this return the max folio size in bytes?

static inline size_t mapping_max_folio_size(void)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
	return PAGE_SIZE;
}

Then the validation looks like:

	const size_t	max_folio_size = mapping_max_folio_size();

	if (mp->m_sb.sb_blocksize > max_folio_size) {
		xfs_warn(mp,
 "block size (%u bytes) not supported; maximum folio size is %u.",
				mp->m_sb.sb_blocksize, max_folio_size);
		error = -ENOSYS;
		goto out_free_sb;
	}

(Don't mind me bikeshedding here.)

> +
> 
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b8a93a8f35cac..e2be8743c2c20 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1647,6 +1647,15 @@ xfs_fs_fill_super(
>                         goto out_free_sb;
>                 }
>  
> +               if (mp->m_sb.sb_blocklog - PAGE_SHIFT >
> +                   mapping_max_folio_order_supported()) {
> +                       xfs_warn(mp,
> +"Block Size (%d bytes) is not supported. Check MAX_PAGECACHE_ORDER",
> +                       mp->m_sb.sb_blocksize);

You might as well print MAX_PAGECACHE_ORDER here to make analysis
easier on less-familiar architectures:

			xfs_warn(mp,
 "block size (%d bytes) is not supported; max folio size is %u.",
					mp->m_sb.sb_blocksize,
					1U << mapping_max_folio_order_supported());

(I wrote this comment first.)

--D

> +                       error = -ENOSYS;
> +                       goto out_free_sb;
> +               }
> +
>                 xfs_warn(mp,
>  "EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
>                         mp->m_sb.sb_blocksize);
> 
> 
> --
> Pankaj
Pankaj Raghav (Samsung) July 9, 2024, 5:33 p.m. UTC | #21
On Tue, Jul 09, 2024 at 05:38:16PM +0100, Matthew Wilcox wrote:
> On Tue, Jul 09, 2024 at 04:29:07PM +0000, Pankaj Raghav (Samsung) wrote:
> > +++ b/include/linux/pagemap.h
> > @@ -394,13 +394,24 @@ static inline void mapping_set_folio_order_range(struct address_space *mapping,
> >                                                  unsigned int min,
> >                                                  unsigned int max)
> >  {
> > -       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > +       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> > +               VM_WARN_ONCE(1, 
> > +       "THP needs to be enabled to support mapping folio order range");
> >                 return;
> > +       }
> 
> No.  Filesystems call mapping_set_folio_order_range() without it being
> conditional on CONFIG_TRANSPARENT_HUGEPAGE.  Usually that takes the
> form of an unconditional call to mapping_set_large_folios().

Ah, you are right.

Actually thinking more about it, we don't need VM_WARN_ONCE on
CONFIG_THP IS_ENABLED, because if we go the route where a FS will
call something like `mapping_max_folio_order_supported()` during mount
time, that will already return `0` as the maximum order that will be
supported.

So just something like this should be enough:
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 14e1415f7dcf..ef6b13854385 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -397,10 +397,18 @@ static inline void mapping_set_folio_order_range(struct address_space *mapping,
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;
 
-       if (min > MAX_PAGECACHE_ORDER)
+       if (min > MAX_PAGECACHE_ORDER) {
+               VM_WARN_ONCE(1, 
+       "min order > MAX_PAGECACHE_ORDER. Setting min_order to MAX_PAGECACHE_ORDER");
                min = MAX_PAGECACHE_ORDER;
-       if (max > MAX_PAGECACHE_ORDER)
+       }
+
+       if (max > MAX_PAGECACHE_ORDER) {
+               VM_WARN_ONCE(1, 
+       "max order > MAX_PAGECACHE_ORDER. Setting max_order to MAX_PAGECACHE_ORDER");
                max = MAX_PAGECACHE_ORDER;
+       }
+
        if (max < min)
                max = min;

If we have a helper such as mapping_max_folio_order_supported() that
could be invoked by FSs to see what page cache could support.

And FSs that call mapping_set_large_folios() as an optimization will not
see these random WARNINGS because we call this function with the actual
min and max range.

Let me know what you think.

--
Pankaj
Pankaj Raghav (Samsung) July 9, 2024, 9:08 p.m. UTC | #22
> > 
> > - We make THP an explicit dependency for XFS:
> > 
> > diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
> > index d41edd30388b7..be2c1c0e9fe8b 100644
> > --- a/fs/xfs/Kconfig
> > +++ b/fs/xfs/Kconfig
> > @@ -5,6 +5,7 @@ config XFS_FS
> >         select EXPORTFS
> >         select LIBCRC32C
> >         select FS_IOMAP
> > +       select TRANSPARENT_HUGEPAGE
> >         help
> >           XFS is a high performance journaling filesystem which originated
> >           on the SGI IRIX platform.  It is completely multi-threaded, can
> > 
> > OR
> > 
> > We create a helper in page cache that FSs can use to check if a specific
> > order can be supported at mount time:
> 
> I like this solution better; if XFS is going to drop support for o[ld]d
> architectures I think we need /some/ sort of notice period.  Or at least
> a better story than "we want to support 64k fsblocks on x64 so we're
> withdrawing support even for 4k fsblocks and smallish filesystems on
> m68k".
> 
> You probably don't want bs>ps support to block on some arcane discussion
> about 32-bit, right? ;)
> 

:)

> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index 14e1415f7dcf..9be775ef11a5 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -374,6 +374,14 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
> >  #define MAX_XAS_ORDER          (XA_CHUNK_SHIFT * 2 - 1)
> >  #define MAX_PAGECACHE_ORDER    min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
> >  
> > +
> > +static inline unsigned int mapping_max_folio_order_supported()
> > +{
> > +    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > +      return 0;
> 
> Shouldn't this line be indented by two tabs, not six spaces?
> 
> > +    return MAX_PAGECACHE_ORDER;
> > +}
> 
> Alternately, should this return the max folio size in bytes?
> 
> static inline size_t mapping_max_folio_size(void)
> {
> 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> 		return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
> 	return PAGE_SIZE;
> }

We already have mapping_max_folio_size(mapping) which returns the
maximum folio order set for that mapping. So this could be called as
mapping_max_folio_size_supported().

So we could just have mapping_max_folio_size_supported() instead of
having mapping_max_folio_order_supported as you suggest.

> 
> Then the validation looks like:
> 
> 	const size_t	max_folio_size = mapping_max_folio_size();
> 
> 	if (mp->m_sb.sb_blocksize > max_folio_size) {
> 		xfs_warn(mp,
>  "block size (%u bytes) not supported; maximum folio size is %u.",
> 				mp->m_sb.sb_blocksize, max_folio_size);
> 		error = -ENOSYS;
> 		goto out_free_sb;
> 	}
> 
> (Don't mind me bikeshedding here.)
> 
> > +
> > 
> > 
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index b8a93a8f35cac..e2be8743c2c20 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -1647,6 +1647,15 @@ xfs_fs_fill_super(
> >                         goto out_free_sb;
> >                 }
> >  
> > +               if (mp->m_sb.sb_blocklog - PAGE_SHIFT >
> > +                   mapping_max_folio_order_supported()) {
> > +                       xfs_warn(mp,
> > +"Block Size (%d bytes) is not supported. Check MAX_PAGECACHE_ORDER",
> > +                       mp->m_sb.sb_blocksize);
> 
> You might as well print MAX_PAGECACHE_ORDER here to make analysis
> easier on less-familiar architectures:

Yes!

> 
> 			xfs_warn(mp,
>  "block size (%d bytes) is not supported; max folio size is %u.",
> 					mp->m_sb.sb_blocksize,
> 					1U << mapping_max_folio_order_supported());
> 
> (I wrote this comment first.)

> 
> --D
> 
> > +                       error = -ENOSYS;
> > +                       goto out_free_sb;
> > +               }
> > +
> >                 xfs_warn(mp,
> >  "EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
> >                         mp->m_sb.sb_blocksize);
> > 
> > 
> > --
> > Pankaj
Darrick J. Wong July 9, 2024, 9:59 p.m. UTC | #23
On Tue, Jul 09, 2024 at 09:08:29PM +0000, Pankaj Raghav (Samsung) wrote:
> > > 
> > > - We make THP an explicit dependency for XFS:
> > > 
> > > diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
> > > index d41edd30388b7..be2c1c0e9fe8b 100644
> > > --- a/fs/xfs/Kconfig
> > > +++ b/fs/xfs/Kconfig
> > > @@ -5,6 +5,7 @@ config XFS_FS
> > >         select EXPORTFS
> > >         select LIBCRC32C
> > >         select FS_IOMAP
> > > +       select TRANSPARENT_HUGEPAGE
> > >         help
> > >           XFS is a high performance journaling filesystem which originated
> > >           on the SGI IRIX platform.  It is completely multi-threaded, can
> > > 
> > > OR
> > > 
> > > We create a helper in page cache that FSs can use to check if a specific
> > > order can be supported at mount time:
> > 
> > I like this solution better; if XFS is going to drop support for o[ld]d
> > architectures I think we need /some/ sort of notice period.  Or at least
> > a better story than "we want to support 64k fsblocks on x64 so we're
> > withdrawing support even for 4k fsblocks and smallish filesystems on
> > m68k".
> > 
> > You probably don't want bs>ps support to block on some arcane discussion
> > about 32-bit, right? ;)
> > 
> 
> :)
> 
> > > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > > index 14e1415f7dcf..9be775ef11a5 100644
> > > --- a/include/linux/pagemap.h
> > > +++ b/include/linux/pagemap.h
> > > @@ -374,6 +374,14 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
> > >  #define MAX_XAS_ORDER          (XA_CHUNK_SHIFT * 2 - 1)
> > >  #define MAX_PAGECACHE_ORDER    min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
> > >  
> > > +
> > > +static inline unsigned int mapping_max_folio_order_supported()
> > > +{
> > > +    if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > > +      return 0;
> > 
> > Shouldn't this line be indented by two tabs, not six spaces?
> > 
> > > +    return MAX_PAGECACHE_ORDER;
> > > +}
> > 
> > Alternately, should this return the max folio size in bytes?
> > 
> > static inline size_t mapping_max_folio_size(void)
> > {
> > 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> > 		return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
> > 	return PAGE_SIZE;
> > }
> 
> We already have mapping_max_folio_size(mapping) which returns the
> maximum folio order set for that mapping. So this could be called as
> mapping_max_folio_size_supported().
> 
> So we could just have mapping_max_folio_size_supported() instead of
> having mapping_max_folio_order_supported as you suggest.

<nod>

> > 
> > Then the validation looks like:
> > 
> > 	const size_t	max_folio_size = mapping_max_folio_size();
> > 
> > 	if (mp->m_sb.sb_blocksize > max_folio_size) {
> > 		xfs_warn(mp,
> >  "block size (%u bytes) not supported; maximum folio size is %u.",
> > 				mp->m_sb.sb_blocksize, max_folio_size);
> > 		error = -ENOSYS;
> > 		goto out_free_sb;
> > 	}
> > 
> > (Don't mind me bikeshedding here.)
> > 
> > > +
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > index b8a93a8f35cac..e2be8743c2c20 100644
> > > --- a/fs/xfs/xfs_super.c
> > > +++ b/fs/xfs/xfs_super.c
> > > @@ -1647,6 +1647,15 @@ xfs_fs_fill_super(
> > >                         goto out_free_sb;
> > >                 }
> > >  
> > > +               if (mp->m_sb.sb_blocklog - PAGE_SHIFT >
> > > +                   mapping_max_folio_order_supported()) {
> > > +                       xfs_warn(mp,
> > > +"Block Size (%d bytes) is not supported. Check MAX_PAGECACHE_ORDER",
> > > +                       mp->m_sb.sb_blocksize);
> > 
> > You might as well print MAX_PAGECACHE_ORDER here to make analysis
> > easier on less-familiar architectures:
> 
> Yes!

Thanks.

--D

> > 
> > 			xfs_warn(mp,
> >  "block size (%d bytes) is not supported; max folio size is %u.",
> > 					mp->m_sb.sb_blocksize,
> > 					1U << mapping_max_folio_order_supported());
> > 
> > (I wrote this comment first.)
> 
> > 
> > --D
> > 
> > > +                       error = -ENOSYS;
> > > +                       goto out_free_sb;
> > > +               }
> > > +
> > >                 xfs_warn(mp,
> > >  "EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
> > >                         mp->m_sb.sb_blocksize);
> > > 
> > > 
> > > --
> > > Pankaj
>
diff mbox series

Patch

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b71d581091f..0c51154cdb57 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,14 +204,21 @@  enum mapping_flags {
 	AS_EXITING	= 4, 	/* final truncate in progress */
 	/* writeback related tags are not used */
 	AS_NO_WRITEBACK_TAGS = 5,
-	AS_LARGE_FOLIO_SUPPORT = 6,
-	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
-	AS_STABLE_WRITES,	/* must wait for writeback before modifying
+	AS_RELEASE_ALWAYS = 6,	/* Call ->release_folio(), even if no private data */
+	AS_STABLE_WRITES = 7,	/* must wait for writeback before modifying
 				   folio contents */
-	AS_UNMOVABLE,		/* The mapping cannot be moved, ever */
-	AS_INACCESSIBLE,	/* Do not attempt direct R/W access to the mapping */
+	AS_UNMOVABLE = 8,	/* The mapping cannot be moved, ever */
+	AS_INACCESSIBLE = 9,	/* Do not attempt direct R/W access to the mapping */
+	/* Bits 16-25 are used for FOLIO_ORDER */
+	AS_FOLIO_ORDER_BITS = 5,
+	AS_FOLIO_ORDER_MIN = 16,
+	AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
 };
 
+#define AS_FOLIO_ORDER_MASK     ((1u << AS_FOLIO_ORDER_BITS) - 1)
+#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_MASK << AS_FOLIO_ORDER_MIN)
+#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_MASK << AS_FOLIO_ORDER_MAX)
+
 /**
  * mapping_set_error - record a writeback error in the address_space
  * @mapping: the mapping in which an error should be set
@@ -360,9 +367,49 @@  static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define MAX_PAGECACHE_ORDER	8
 #endif
 
+/*
+ * mapping_set_folio_order_range() - Set the orders supported by a file.
+ * @mapping: The address space of the file.
+ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
+ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
+ *
+ * The filesystem should call this function in its inode constructor to
+ * indicate which base size (min) and maximum size (max) of folio the VFS
+ * can use to cache the contents of the file.  This should only be used
+ * if the filesystem needs special handling of folio sizes (ie there is
+ * something the core cannot know).
+ * Do not tune it based on, eg, i_size.
+ *
+ * Context: This should not be called while the inode is active as it
+ * is non-atomic.
+ */
+static inline void mapping_set_folio_order_range(struct address_space *mapping,
+						 unsigned int min,
+						 unsigned int max)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return;
+
+	if (min > MAX_PAGECACHE_ORDER)
+		min = MAX_PAGECACHE_ORDER;
+	if (max > MAX_PAGECACHE_ORDER)
+		max = MAX_PAGECACHE_ORDER;
+	if (max < min)
+		max = min;
+
+	mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
+		(min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX);
+}
+
+static inline void mapping_set_folio_min_order(struct address_space *mapping,
+					       unsigned int min)
+{
+	mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
+}
+
 /**
  * mapping_set_large_folios() - Indicate the file supports large folios.
- * @mapping: The file.
+ * @mapping: The address space of the file.
  *
  * The filesystem should call this function in its inode constructor to
  * indicate that the VFS can use large folios to cache the contents of
@@ -373,7 +420,23 @@  static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
  */
 static inline void mapping_set_large_folios(struct address_space *mapping)
 {
-	__set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+	mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER);
+}
+
+static inline
+unsigned int mapping_max_folio_order(const struct address_space *mapping)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return 0;
+	return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX;
+}
+
+static inline
+unsigned int mapping_min_folio_order(const struct address_space *mapping)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return 0;
+	return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
 }
 
 /*
@@ -386,16 +449,13 @@  static inline bool mapping_large_folio_support(struct address_space *mapping)
 	VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
 			"Anonymous mapping always supports large folio");
 
-	return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
-		test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+	return mapping_max_folio_order(mapping) > 0;
 }
 
 /* Return the maximum folio size for this pagecache mapping, in bytes. */
-static inline size_t mapping_max_folio_size(struct address_space *mapping)
+static inline size_t mapping_max_folio_size(const struct address_space *mapping)
 {
-	if (mapping_large_folio_support(mapping))
-		return PAGE_SIZE << MAX_PAGECACHE_ORDER;
-	return PAGE_SIZE;
+	return PAGE_SIZE << mapping_max_folio_order(mapping);
 }
 
 static inline int filemap_nr_thps(struct address_space *mapping)
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b8c732bb643..d617c9afca51 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1933,10 +1933,8 @@  struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		if (!mapping_large_folio_support(mapping))
-			order = 0;
-		if (order > MAX_PAGECACHE_ORDER)
-			order = MAX_PAGECACHE_ORDER;
+		if (order > mapping_max_folio_order(mapping))
+			order = mapping_max_folio_order(mapping);
 		/* If we're not aligned, allocate a smaller folio */
 		if (index & ((1UL << order) - 1))
 			order = __ffs(index);
diff --git a/mm/readahead.c b/mm/readahead.c
index c1b23989d9ca..66058ae02f2e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -503,9 +503,9 @@  void page_cache_ra_order(struct readahead_control *ractl,
 
 	limit = min(limit, index + ra->size - 1);
 
-	if (new_order < MAX_PAGECACHE_ORDER) {
+	if (new_order < mapping_max_folio_order(mapping)) {
 		new_order += 2;
-		new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+		new_order = min(mapping_max_folio_order(mapping), new_order);
 		new_order = min_t(unsigned int, new_order, ilog2(ra->size));
 	}