diff mbox series

[2/3] xfs: use folios in the buffer cache

Message ID 20240118222216.4131379-3-david@fromorbit.com (mailing list archive)
State New
Headers show
Series xfs: use large folios for buffers | expand

Commit Message

Dave Chinner Jan. 18, 2024, 10:19 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Convert the use of struct pages to struct folio everywhere. This
is just direct API conversion, no actual logic of code changes
should result.

Note: this conversion currently assumes only single page folios are
allocated, and because some of the MM interfaces we use take
pointers to arrays of struct pages, the address of single page
folios and struct pages are the same. e.g alloc_pages_bulk_array(),
vm_map_ram(), etc.


Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf.c      | 127 +++++++++++++++++++++---------------------
 fs/xfs/xfs_buf.h      |  14 ++---
 fs/xfs/xfs_buf_item.c |   2 +-
 fs/xfs/xfs_linux.h    |   8 +++
 4 files changed, 80 insertions(+), 71 deletions(-)

Comments

Darrick J. Wong Jan. 19, 2024, 1:26 a.m. UTC | #1
On Fri, Jan 19, 2024 at 09:19:40AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Convert the use of struct pages to struct folio everywhere. This
> is just direct API conversion, no actual logic of code changes
> should result.
> 
> Note: this conversion currently assumes only single page folios are
> allocated, and because some of the MM interfaces we use take
> pointers to arrays of struct pages, the address of single page
> folios and struct pages are the same. e.g alloc_pages_bulk_array(),
> vm_map_ram(), etc.
> 
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_buf.c      | 127 +++++++++++++++++++++---------------------
>  fs/xfs/xfs_buf.h      |  14 ++---
>  fs/xfs/xfs_buf_item.c |   2 +-
>  fs/xfs/xfs_linux.h    |   8 +++
>  4 files changed, 80 insertions(+), 71 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 08f2fbc04db5..15907e92d0d3 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -60,25 +60,25 @@ xfs_buf_submit(
>  	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
>  }
>  
> +/*
> + * Return true if the buffer is vmapped.
> + *
> + * b_addr is null if the buffer is not mapped, but the code is clever enough to
> + * know it doesn't have to map a single folio, so the check has to be both for
> + * b_addr and bp->b_folio_count > 1.
> + */
>  static inline int
>  xfs_buf_is_vmapped(
>  	struct xfs_buf	*bp)
>  {
> -	/*
> -	 * Return true if the buffer is vmapped.
> -	 *
> -	 * b_addr is null if the buffer is not mapped, but the code is clever
> -	 * enough to know it doesn't have to map a single page, so the check has
> -	 * to be both for b_addr and bp->b_page_count > 1.
> -	 */
> -	return bp->b_addr && bp->b_page_count > 1;
> +	return bp->b_addr && bp->b_folio_count > 1;
>  }
>  
>  static inline int
>  xfs_buf_vmap_len(
>  	struct xfs_buf	*bp)
>  {
> -	return (bp->b_page_count * PAGE_SIZE);
> +	return (bp->b_folio_count * PAGE_SIZE);
>  }
>  
>  /*
> @@ -197,7 +197,7 @@ xfs_buf_get_maps(
>  }
>  
>  /*
> - *	Frees b_pages if it was allocated.
> + *	Frees b_maps if it was allocated.
>   */
>  static void
>  xfs_buf_free_maps(
> @@ -273,26 +273,26 @@ _xfs_buf_alloc(
>  }
>  
>  static void
> -xfs_buf_free_pages(
> +xfs_buf_free_folios(
>  	struct xfs_buf	*bp)
>  {
>  	uint		i;
>  
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> +	ASSERT(bp->b_flags & _XBF_FOLIOS);
>  
>  	if (xfs_buf_is_vmapped(bp))
> -		vm_unmap_ram(bp->b_addr, bp->b_page_count);
> +		vm_unmap_ram(bp->b_addr, bp->b_folio_count);
>  
> -	for (i = 0; i < bp->b_page_count; i++) {
> -		if (bp->b_pages[i])
> -			__free_page(bp->b_pages[i]);
> +	for (i = 0; i < bp->b_folio_count; i++) {
> +		if (bp->b_folios[i])
> +			__folio_put(bp->b_folios[i]);
>  	}
> -	mm_account_reclaimed_pages(bp->b_page_count);
> +	mm_account_reclaimed_pages(bp->b_folio_count);
>  
> -	if (bp->b_pages != bp->b_page_array)
> -		kfree(bp->b_pages);
> -	bp->b_pages = NULL;
> -	bp->b_flags &= ~_XBF_PAGES;
> +	if (bp->b_folios != bp->b_folio_array)
> +		kfree(bp->b_folios);
> +	bp->b_folios = NULL;
> +	bp->b_flags &= ~_XBF_FOLIOS;
>  }
>  
>  static void
> @@ -313,8 +313,8 @@ xfs_buf_free(
>  
>  	ASSERT(list_empty(&bp->b_lru));
>  
> -	if (bp->b_flags & _XBF_PAGES)
> -		xfs_buf_free_pages(bp);
> +	if (bp->b_flags & _XBF_FOLIOS)
> +		xfs_buf_free_folios(bp);
>  	else if (bp->b_flags & _XBF_KMEM)
>  		kfree(bp->b_addr);
>  
> @@ -345,15 +345,15 @@ xfs_buf_alloc_kmem(
>  		return -ENOMEM;
>  	}
>  	bp->b_offset = offset_in_page(bp->b_addr);
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_pages[0] = kmem_to_page(bp->b_addr);
> -	bp->b_page_count = 1;
> +	bp->b_folios = bp->b_folio_array;
> +	bp->b_folios[0] = kmem_to_folio(bp->b_addr);
> +	bp->b_folio_count = 1;
>  	bp->b_flags |= _XBF_KMEM;
>  	return 0;
>  }
>  
>  static int
> -xfs_buf_alloc_pages(
> +xfs_buf_alloc_folios(
>  	struct xfs_buf	*bp,
>  	xfs_buf_flags_t	flags)
>  {
> @@ -364,16 +364,16 @@ xfs_buf_alloc_pages(
>  		gfp_mask |= __GFP_NORETRY;
>  
>  	/* Make sure that we have a page list */
> -	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
> -	if (bp->b_page_count <= XB_PAGES) {
> -		bp->b_pages = bp->b_page_array;
> +	bp->b_folio_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
> +	if (bp->b_folio_count <= XB_FOLIOS) {
> +		bp->b_folios = bp->b_folio_array;
>  	} else {
> -		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
> +		bp->b_folios = kzalloc(sizeof(struct folio *) * bp->b_folio_count,
>  					gfp_mask);
> -		if (!bp->b_pages)
> +		if (!bp->b_folios)
>  			return -ENOMEM;
>  	}
> -	bp->b_flags |= _XBF_PAGES;
> +	bp->b_flags |= _XBF_FOLIOS;
>  
>  	/* Assure zeroed buffer for non-read cases. */
>  	if (!(flags & XBF_READ))
> @@ -387,9 +387,9 @@ xfs_buf_alloc_pages(
>  	for (;;) {
>  		long	last = filled;
>  
> -		filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
> -						bp->b_pages);
> -		if (filled == bp->b_page_count) {
> +		filled = alloc_pages_bulk_array(gfp_mask, bp->b_folio_count,
> +						(struct page **)bp->b_folios);

Ugh, pointer casting.  I suppose here is where we might want an
alloc_folio_bulk_array that might give us successively smaller
large-folios until b_page_count is satisfied?  (Maybe that's in the next
patch?)

I guess you'd also need a large-folio capable vm_map_ram.  Both of
these things sound reasonable, particularly if somebody wants to write
us a new buffer cache for ext2rs and support large block sizes.

Assuming that one of the goals here is (say) to be able to mount a 16k
blocksize filesystem and try to get 16k folios for the buffer cache?

--D

> +		if (filled == bp->b_folio_count) {
>  			XFS_STATS_INC(bp->b_mount, xb_page_found);
>  			break;
>  		}
> @@ -398,7 +398,7 @@ xfs_buf_alloc_pages(
>  			continue;
>  
>  		if (flags & XBF_READ_AHEAD) {
> -			xfs_buf_free_pages(bp);
> +			xfs_buf_free_folios(bp);
>  			return -ENOMEM;
>  		}
>  
> @@ -412,14 +412,14 @@ xfs_buf_alloc_pages(
>   *	Map buffer into kernel address-space if necessary.
>   */
>  STATIC int
> -_xfs_buf_map_pages(
> +_xfs_buf_map_folios(
>  	struct xfs_buf		*bp,
>  	xfs_buf_flags_t		flags)
>  {
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> -	if (bp->b_page_count == 1) {
> +	ASSERT(bp->b_flags & _XBF_FOLIOS);
> +	if (bp->b_folio_count == 1) {
>  		/* A single page buffer is always mappable */
> -		bp->b_addr = page_address(bp->b_pages[0]);
> +		bp->b_addr = folio_address(bp->b_folios[0]);
>  	} else if (flags & XBF_UNMAPPED) {
>  		bp->b_addr = NULL;
>  	} else {
> @@ -443,8 +443,8 @@ _xfs_buf_map_pages(
>  		 */
>  		nofs_flag = memalloc_nofs_save();
>  		do {
> -			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
> -						-1);
> +			bp->b_addr = vm_map_ram((struct page **)bp->b_folios,
> +					bp->b_folio_count, -1);
>  			if (bp->b_addr)
>  				break;
>  			vm_unmap_aliases();
> @@ -571,7 +571,7 @@ xfs_buf_find_lock(
>  			return -ENOENT;
>  		}
>  		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
> -		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
> +		bp->b_flags &= _XBF_KMEM | _XBF_FOLIOS;
>  		bp->b_ops = NULL;
>  	}
>  	return 0;
> @@ -629,14 +629,15 @@ xfs_buf_find_insert(
>  		goto out_drop_pag;
>  
>  	/*
> -	 * For buffers that fit entirely within a single page, first attempt to
> -	 * allocate the memory from the heap to minimise memory usage. If we
> -	 * can't get heap memory for these small buffers, we fall back to using
> -	 * the page allocator.
> +	 * For buffers that fit entirely within a single page folio, first
> +	 * attempt to allocate the memory from the heap to minimise memory
> +	 * usage. If we can't get heap memory for these small buffers, we fall
> +	 * back to using the page allocator.
>  	 */
> +
>  	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
>  	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
> -		error = xfs_buf_alloc_pages(new_bp, flags);
> +		error = xfs_buf_alloc_folios(new_bp, flags);
>  		if (error)
>  			goto out_free_buf;
>  	}
> @@ -728,11 +729,11 @@ xfs_buf_get_map(
>  
>  	/* We do not hold a perag reference anymore. */
>  	if (!bp->b_addr) {
> -		error = _xfs_buf_map_pages(bp, flags);
> +		error = _xfs_buf_map_folios(bp, flags);
>  		if (unlikely(error)) {
>  			xfs_warn_ratelimited(btp->bt_mount,
> -				"%s: failed to map %u pages", __func__,
> -				bp->b_page_count);
> +				"%s: failed to map %u folios", __func__,
> +				bp->b_folio_count);
>  			xfs_buf_relse(bp);
>  			return error;
>  		}
> @@ -963,14 +964,14 @@ xfs_buf_get_uncached(
>  	if (error)
>  		return error;
>  
> -	error = xfs_buf_alloc_pages(bp, flags);
> +	error = xfs_buf_alloc_folios(bp, flags);
>  	if (error)
>  		goto fail_free_buf;
>  
> -	error = _xfs_buf_map_pages(bp, 0);
> +	error = _xfs_buf_map_folios(bp, 0);
>  	if (unlikely(error)) {
>  		xfs_warn(target->bt_mount,
> -			"%s: failed to map pages", __func__);
> +			"%s: failed to map folios", __func__);
>  		goto fail_free_buf;
>  	}
>  
> @@ -1465,7 +1466,7 @@ xfs_buf_ioapply_map(
>  	blk_opf_t	op)
>  {
>  	int		page_index;
> -	unsigned int	total_nr_pages = bp->b_page_count;
> +	unsigned int	total_nr_pages = bp->b_folio_count;
>  	int		nr_pages;
>  	struct bio	*bio;
>  	sector_t	sector =  bp->b_maps[map].bm_bn;
> @@ -1503,7 +1504,7 @@ xfs_buf_ioapply_map(
>  		if (nbytes > size)
>  			nbytes = size;
>  
> -		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
> +		rbytes = bio_add_folio(bio, bp->b_folios[page_index], nbytes,
>  				      offset);
>  		if (rbytes < nbytes)
>  			break;
> @@ -1716,13 +1717,13 @@ xfs_buf_offset(
>  	struct xfs_buf		*bp,
>  	size_t			offset)
>  {
> -	struct page		*page;
> +	struct folio		*folio;
>  
>  	if (bp->b_addr)
>  		return bp->b_addr + offset;
>  
> -	page = bp->b_pages[offset >> PAGE_SHIFT];
> -	return page_address(page) + (offset & (PAGE_SIZE-1));
> +	folio = bp->b_folios[offset >> PAGE_SHIFT];
> +	return folio_address(folio) + (offset & (PAGE_SIZE-1));
>  }
>  
>  void
> @@ -1735,18 +1736,18 @@ xfs_buf_zero(
>  
>  	bend = boff + bsize;
>  	while (boff < bend) {
> -		struct page	*page;
> +		struct folio	*folio;
>  		int		page_index, page_offset, csize;
>  
>  		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
>  		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
> -		page = bp->b_pages[page_index];
> +		folio = bp->b_folios[page_index];
>  		csize = min_t(size_t, PAGE_SIZE - page_offset,
>  				      BBTOB(bp->b_length) - boff);
>  
>  		ASSERT((csize + page_offset) <= PAGE_SIZE);
>  
> -		memset(page_address(page) + page_offset, 0, csize);
> +		memset(folio_address(folio) + page_offset, 0, csize);
>  
>  		boff += csize;
>  	}
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index b470de08a46c..1e7298ff3fa5 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -29,7 +29,7 @@ struct xfs_buf;
>  #define XBF_READ_AHEAD	 (1u << 2) /* asynchronous read-ahead */
>  #define XBF_NO_IOACCT	 (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
>  #define XBF_ASYNC	 (1u << 4) /* initiator will not wait for completion */
> -#define XBF_DONE	 (1u << 5) /* all pages in the buffer uptodate */
> +#define XBF_DONE	 (1u << 5) /* all folios in the buffer uptodate */
>  #define XBF_STALE	 (1u << 6) /* buffer has been staled, do not find it */
>  #define XBF_WRITE_FAIL	 (1u << 7) /* async writes have failed on this buffer */
>  
> @@ -39,7 +39,7 @@ struct xfs_buf;
>  #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
>  
>  /* flags used only internally */
> -#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
> +#define _XBF_FOLIOS	 (1u << 20)/* backed by refcounted folios */
>  #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
>  #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
>  
> @@ -68,7 +68,7 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ _XBF_INODES,		"INODES" }, \
>  	{ _XBF_DQUOTS,		"DQUOTS" }, \
>  	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
> -	{ _XBF_PAGES,		"PAGES" }, \
> +	{ _XBF_FOLIOS,		"FOLIOS" }, \
>  	{ _XBF_KMEM,		"KMEM" }, \
>  	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
>  	/* The following interface flags should never be set */ \
> @@ -116,7 +116,7 @@ typedef struct xfs_buftarg {
>  	struct ratelimit_state	bt_ioerror_rl;
>  } xfs_buftarg_t;
>  
> -#define XB_PAGES	2
> +#define XB_FOLIOS	2
>  
>  struct xfs_buf_map {
>  	xfs_daddr_t		bm_bn;	/* block number for I/O */
> @@ -180,14 +180,14 @@ struct xfs_buf {
>  	struct xfs_buf_log_item	*b_log_item;
>  	struct list_head	b_li_list;	/* Log items list head */
>  	struct xfs_trans	*b_transp;
> -	struct page		**b_pages;	/* array of page pointers */
> -	struct page		*b_page_array[XB_PAGES]; /* inline pages */
> +	struct folio		**b_folios;	/* array of folio pointers */
> +	struct folio		*b_folio_array[XB_FOLIOS]; /* inline folios */
>  	struct xfs_buf_map	*b_maps;	/* compound buffer map */
>  	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
>  	int			b_map_count;
>  	atomic_t		b_pin_count;	/* pin count */
>  	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
> -	unsigned int		b_page_count;	/* size of page array */
> +	unsigned int		b_folio_count;	/* size of folio array */
>  	unsigned int		b_offset;	/* page offset of b_addr,
>  						   only for _XBF_KMEM buffers */
>  	int			b_error;	/* error code on I/O */
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 83a81cb52d8e..d1407cee48d9 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -69,7 +69,7 @@ xfs_buf_item_straddle(
>  {
>  	void			*first, *last;
>  
> -	if (bp->b_page_count == 1 || !(bp->b_flags & XBF_UNMAPPED))
> +	if (bp->b_folio_count == 1 || !(bp->b_flags & XBF_UNMAPPED))
>  		return false;
>  
>  	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
> diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> index caccb7f76690..804389b8e802 100644
> --- a/fs/xfs/xfs_linux.h
> +++ b/fs/xfs/xfs_linux.h
> @@ -279,4 +279,12 @@ kmem_to_page(void *addr)
>  	return virt_to_page(addr);
>  }
>  
> +static inline struct folio *
> +kmem_to_folio(void *addr)
> +{
> +	if (is_vmalloc_addr(addr))
> +		return page_folio(vmalloc_to_page(addr));
> +	return virt_to_folio(addr);
> +}
> +
>  #endif /* __XFS_LINUX__ */
> -- 
> 2.43.0
> 
>
Dave Chinner Jan. 19, 2024, 7:03 a.m. UTC | #2
On Thu, Jan 18, 2024 at 05:26:24PM -0800, Darrick J. Wong wrote:
> On Fri, Jan 19, 2024 at 09:19:40AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Convert the use of struct pages to struct folio everywhere. This
> > is just direct API conversion, no actual logic of code changes
> > should result.
> > 
> > Note: this conversion currently assumes only single page folios are
> > allocated, and because some of the MM interfaces we use take
> > pointers to arrays of struct pages, the address of single page
> > folios and struct pages are the same. e.g alloc_pages_bulk_array(),
> > vm_map_ram(), etc.
> > 
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
.....
> > @@ -387,9 +387,9 @@ xfs_buf_alloc_pages(
> >  	for (;;) {
> >  		long	last = filled;
> >  
> > -		filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
> > -						bp->b_pages);
> > -		if (filled == bp->b_page_count) {
> > +		filled = alloc_pages_bulk_array(gfp_mask, bp->b_folio_count,
> > +						(struct page **)bp->b_folios);
> 
> Ugh, pointer casting.  I suppose here is where we might want an
> alloc_folio_bulk_array that might give us successively smaller
> large-folios until b_page_count is satisfied?  (Maybe that's in the next
> patch?)

No, I explicitly chose not to do that because then converting buffer
offset to memory address becomes excitingly complex. With fixed size
folios, it's just simple math. With variable, unknown sized objects,
we either have to store the size of each object with the pointer,
or walk each object grabbing the size to determine what folio in the
buffer corresponds to a specific offset.

And it's now the slow path, so I don't really care to optimise it
that much.

> I guess you'd also need a large-folio capable vm_map_ram.  Both of
> these things sound reasonable, particularly if somebody wants to write
> us a new buffer cache for ext2rs and support large block sizes.

Maybe so, but we do not require them and I don't really have the
time or desire to try to implement something like that. And, really
what benefit do small multipage folios bring us if we still have to
vmap them?

> Assuming that one of the goals here is (say) to be able to mount a 16k
> blocksize filesystem and try to get 16k folios for the buffer cache?

The goal is that we optimistically use large folios where-ever we
have metadata buffers that are larger than a single page, regardless
of the filesystem block size.

Right now on a 4kB block size filesystem that means inode cluster
buffers (16kB for 512 byte inodes), user xattr buffers larger than a
single page, and directory blocks if the filesytsem is configure
with "-n size=X" and X is 8kB or larger.

On filesystems with block sizes larger than 4kB, it will try to use
large folios for everything but the sector sized AG headers.

-Dave.
Christoph Hellwig Jan. 22, 2024, 6:39 a.m. UTC | #3
On Thu, Jan 18, 2024 at 05:26:24PM -0800, Darrick J. Wong wrote:
> Ugh, pointer casting.  I suppose here is where we might want an
> alloc_folio_bulk_array that might give us successively smaller
> large-folios until b_page_count is satisfied?  (Maybe that's in the next
> patch?)
> 
> I guess you'd also need a large-folio capable vm_map_ram. 

We need to just stop using vm_map_ram, there is no reason to do that
even right now.  It was needed when we used the page cache to back
pagebuf, but these days just sing vmalloc is the right thing for
!unmapped buffers that can't use large folios.  And I'm seriously
wondering if we should bother with unmapped buffers in the long run
if we end up normally using larger folios or just consolidate down to:

 - kmalloc for buffers < PAGE_SIZE
 - folio for buffers >= PAGE_SIZE
 - vmalloc if allocation a larger folios is not possible
Dave Chinner Jan. 22, 2024, 12:05 p.m. UTC | #4
On Sun, Jan 21, 2024 at 10:39:12PM -0800, Christoph Hellwig wrote:
> On Thu, Jan 18, 2024 at 05:26:24PM -0800, Darrick J. Wong wrote:
> > Ugh, pointer casting.  I suppose here is where we might want an
> > alloc_folio_bulk_array that might give us successively smaller
> > large-folios until b_page_count is satisfied?  (Maybe that's in the next
> > patch?)
> > 
> > I guess you'd also need a large-folio capable vm_map_ram. 
> 
> We need to just stop using vm_map_ram, there is no reason to do that
> even right now.  It was needed when we used the page cache to back
> pagebuf, but these days just sing vmalloc is the right thing for
> !unmapped buffers that can't use large folios. 

I haven't looked at what using vmalloc means for packing the buffer
into a bio - we currently use bio_add_page(), so does that mean we
have to use some variant of virt_to_page() to break the vmalloc
region up into it's backing pages to feed them to the bio? Or is
there some helper that I'm unaware of that does it all for us
magically?

> And I'm seriously
> wondering if we should bother with unmapped buffers in the long run
> if we end up normally using larger folios or just consolidate down to:
> 
>  - kmalloc for buffers < PAGE_SIZE
>  - folio for buffers >= PAGE_SIZE
>  - vmalloc if allocation a larger folios is not possible

Yeah, that's kind of where I'm going with this. Large folios already
turn off unmapped buffers, and I'd really like to get rid of that
page straddling mess that unmapped buffers require in the buffer
item dirty region tracking. That means we have to get rid of
unmapped buffers....

-Dave.
Christoph Hellwig Jan. 22, 2024, 1:18 p.m. UTC | #5
On Mon, Jan 22, 2024 at 11:05:20PM +1100, Dave Chinner wrote:
> I haven't looked at what using vmalloc means for packing the buffer
> into a bio - we currently use bio_add_page(), so does that mean we
> have to use some variant of virt_to_page() to break the vmalloc
> region up into it's backing pages to feed them to the bio? Or is
> there some helper that I'm unaware of that does it all for us
> magically?

We have a kmem_to_page helper for chuking any kind of kernel virtual
address space into pages.  xfs_rw_bdev in fs/xfs/xfs_bio_io.c uses
that for a bio, we should probably hav an async version of that
and maybe move it to the block layer instead of duplicating the
logic in various places.

> Yeah, that's kind of where I'm going with this. Large folios already
> turn off unmapped buffers, and I'd really like to get rid of that
> page straddling mess that unmapped buffers require in the buffer
> item dirty region tracking. That means we have to get rid of
> unmapped buffers....

I actually have an old series to kill unmapped buffers and use
vmalloc, but decided I'd need use folios for the fast path instead
of paying the vmalloc overhead.  I can dust it off and you can decide
if you want to pick up parts of it.
Dave Chinner Jan. 22, 2024, 9:10 p.m. UTC | #6
On Mon, Jan 22, 2024 at 05:18:58AM -0800, Christoph Hellwig wrote:
> On Mon, Jan 22, 2024 at 11:05:20PM +1100, Dave Chinner wrote:
> > I haven't looked at what using vmalloc means for packing the buffer
> > into a bio - we currently use bio_add_page(), so does that mean we
> > have to use some variant of virt_to_page() to break the vmalloc
> > region up into it's backing pages to feed them to the bio? Or is
> > there some helper that I'm unaware of that does it all for us
> > magically?
> 
> We have a kmem_to_page helper for chuking any kind of kernel virtual
> address space into pages.  xfs_rw_bdev in fs/xfs/xfs_bio_io.c uses
> that for a bio, we should probably hav an async version of that
> and maybe move it to the block layer instead of duplicating the
> logic in various places.

Yeah, OK, as I expected. I'd forgotten that we already play that
game with xfs_rw_bdev(). I think we can trivially factor out an
async version and call that from the xfs_buf.c code fo vmalloc()d
ranges, so I think I'll work towards that and actually remove the
bio packing loop from xfs_buf.c altogether.

> > Yeah, that's kind of where I'm going with this. Large folios already
> > turn off unmapped buffers, and I'd really like to get rid of that
> > page straddling mess that unmapped buffers require in the buffer
> > item dirty region tracking. That means we have to get rid of
> > unmapped buffers....
> 
> I actually have an old series to kill unmapped buffers and use
> vmalloc, but decided I'd need use folios for the fast path instead
> of paying the vmalloc overhead.  I can dust it off and you can decide
> if you want to pick up parts of it.

I wouldn't worry about it too much - the rest of it is pretty
straight forward once we know inode cluster buffers are working
correctly with single large folios and we always fall back to
vmalloc().

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 08f2fbc04db5..15907e92d0d3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -60,25 +60,25 @@  xfs_buf_submit(
 	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
 }
 
+/*
+ * Return true if the buffer is vmapped.
+ *
+ * b_addr is null if the buffer is not mapped, but the code is clever enough to
+ * know it doesn't have to map a single folio, so the check has to be both for
+ * b_addr and bp->b_folio_count > 1.
+ */
 static inline int
 xfs_buf_is_vmapped(
 	struct xfs_buf	*bp)
 {
-	/*
-	 * Return true if the buffer is vmapped.
-	 *
-	 * b_addr is null if the buffer is not mapped, but the code is clever
-	 * enough to know it doesn't have to map a single page, so the check has
-	 * to be both for b_addr and bp->b_page_count > 1.
-	 */
-	return bp->b_addr && bp->b_page_count > 1;
+	return bp->b_addr && bp->b_folio_count > 1;
 }
 
 static inline int
 xfs_buf_vmap_len(
 	struct xfs_buf	*bp)
 {
-	return (bp->b_page_count * PAGE_SIZE);
+	return (bp->b_folio_count * PAGE_SIZE);
 }
 
 /*
@@ -197,7 +197,7 @@  xfs_buf_get_maps(
 }
 
 /*
- *	Frees b_pages if it was allocated.
+ *	Frees b_maps if it was allocated.
  */
 static void
 xfs_buf_free_maps(
@@ -273,26 +273,26 @@  _xfs_buf_alloc(
 }
 
 static void
-xfs_buf_free_pages(
+xfs_buf_free_folios(
 	struct xfs_buf	*bp)
 {
 	uint		i;
 
-	ASSERT(bp->b_flags & _XBF_PAGES);
+	ASSERT(bp->b_flags & _XBF_FOLIOS);
 
 	if (xfs_buf_is_vmapped(bp))
-		vm_unmap_ram(bp->b_addr, bp->b_page_count);
+		vm_unmap_ram(bp->b_addr, bp->b_folio_count);
 
-	for (i = 0; i < bp->b_page_count; i++) {
-		if (bp->b_pages[i])
-			__free_page(bp->b_pages[i]);
+	for (i = 0; i < bp->b_folio_count; i++) {
+		if (bp->b_folios[i])
+			__folio_put(bp->b_folios[i]);
 	}
-	mm_account_reclaimed_pages(bp->b_page_count);
+	mm_account_reclaimed_pages(bp->b_folio_count);
 
-	if (bp->b_pages != bp->b_page_array)
-		kfree(bp->b_pages);
-	bp->b_pages = NULL;
-	bp->b_flags &= ~_XBF_PAGES;
+	if (bp->b_folios != bp->b_folio_array)
+		kfree(bp->b_folios);
+	bp->b_folios = NULL;
+	bp->b_flags &= ~_XBF_FOLIOS;
 }
 
 static void
@@ -313,8 +313,8 @@  xfs_buf_free(
 
 	ASSERT(list_empty(&bp->b_lru));
 
-	if (bp->b_flags & _XBF_PAGES)
-		xfs_buf_free_pages(bp);
+	if (bp->b_flags & _XBF_FOLIOS)
+		xfs_buf_free_folios(bp);
 	else if (bp->b_flags & _XBF_KMEM)
 		kfree(bp->b_addr);
 
@@ -345,15 +345,15 @@  xfs_buf_alloc_kmem(
 		return -ENOMEM;
 	}
 	bp->b_offset = offset_in_page(bp->b_addr);
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = kmem_to_page(bp->b_addr);
-	bp->b_page_count = 1;
+	bp->b_folios = bp->b_folio_array;
+	bp->b_folios[0] = kmem_to_folio(bp->b_addr);
+	bp->b_folio_count = 1;
 	bp->b_flags |= _XBF_KMEM;
 	return 0;
 }
 
 static int
-xfs_buf_alloc_pages(
+xfs_buf_alloc_folios(
 	struct xfs_buf	*bp,
 	xfs_buf_flags_t	flags)
 {
@@ -364,16 +364,16 @@  xfs_buf_alloc_pages(
 		gfp_mask |= __GFP_NORETRY;
 
 	/* Make sure that we have a page list */
-	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
-	if (bp->b_page_count <= XB_PAGES) {
-		bp->b_pages = bp->b_page_array;
+	bp->b_folio_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
+	if (bp->b_folio_count <= XB_FOLIOS) {
+		bp->b_folios = bp->b_folio_array;
 	} else {
-		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
+		bp->b_folios = kzalloc(sizeof(struct folio *) * bp->b_folio_count,
 					gfp_mask);
-		if (!bp->b_pages)
+		if (!bp->b_folios)
 			return -ENOMEM;
 	}
-	bp->b_flags |= _XBF_PAGES;
+	bp->b_flags |= _XBF_FOLIOS;
 
 	/* Assure zeroed buffer for non-read cases. */
 	if (!(flags & XBF_READ))
@@ -387,9 +387,9 @@  xfs_buf_alloc_pages(
 	for (;;) {
 		long	last = filled;
 
-		filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
-						bp->b_pages);
-		if (filled == bp->b_page_count) {
+		filled = alloc_pages_bulk_array(gfp_mask, bp->b_folio_count,
+						(struct page **)bp->b_folios);
+		if (filled == bp->b_folio_count) {
 			XFS_STATS_INC(bp->b_mount, xb_page_found);
 			break;
 		}
@@ -398,7 +398,7 @@  xfs_buf_alloc_pages(
 			continue;
 
 		if (flags & XBF_READ_AHEAD) {
-			xfs_buf_free_pages(bp);
+			xfs_buf_free_folios(bp);
 			return -ENOMEM;
 		}
 
@@ -412,14 +412,14 @@  xfs_buf_alloc_pages(
  *	Map buffer into kernel address-space if necessary.
  */
 STATIC int
-_xfs_buf_map_pages(
+_xfs_buf_map_folios(
 	struct xfs_buf		*bp,
 	xfs_buf_flags_t		flags)
 {
-	ASSERT(bp->b_flags & _XBF_PAGES);
-	if (bp->b_page_count == 1) {
+	ASSERT(bp->b_flags & _XBF_FOLIOS);
+	if (bp->b_folio_count == 1) {
 		/* A single page buffer is always mappable */
-		bp->b_addr = page_address(bp->b_pages[0]);
+		bp->b_addr = folio_address(bp->b_folios[0]);
 	} else if (flags & XBF_UNMAPPED) {
 		bp->b_addr = NULL;
 	} else {
@@ -443,8 +443,8 @@  _xfs_buf_map_pages(
 		 */
 		nofs_flag = memalloc_nofs_save();
 		do {
-			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-						-1);
+			bp->b_addr = vm_map_ram((struct page **)bp->b_folios,
+					bp->b_folio_count, -1);
 			if (bp->b_addr)
 				break;
 			vm_unmap_aliases();
@@ -571,7 +571,7 @@  xfs_buf_find_lock(
 			return -ENOENT;
 		}
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+		bp->b_flags &= _XBF_KMEM | _XBF_FOLIOS;
 		bp->b_ops = NULL;
 	}
 	return 0;
@@ -629,14 +629,15 @@  xfs_buf_find_insert(
 		goto out_drop_pag;
 
 	/*
-	 * For buffers that fit entirely within a single page, first attempt to
-	 * allocate the memory from the heap to minimise memory usage. If we
-	 * can't get heap memory for these small buffers, we fall back to using
-	 * the page allocator.
+	 * For buffers that fit entirely within a single page folio, first
+	 * attempt to allocate the memory from the heap to minimise memory
+	 * usage. If we can't get heap memory for these small buffers, we fall
+	 * back to using the page allocator.
 	 */
+
 	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
 	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
-		error = xfs_buf_alloc_pages(new_bp, flags);
+		error = xfs_buf_alloc_folios(new_bp, flags);
 		if (error)
 			goto out_free_buf;
 	}
@@ -728,11 +729,11 @@  xfs_buf_get_map(
 
 	/* We do not hold a perag reference anymore. */
 	if (!bp->b_addr) {
-		error = _xfs_buf_map_pages(bp, flags);
+		error = _xfs_buf_map_folios(bp, flags);
 		if (unlikely(error)) {
 			xfs_warn_ratelimited(btp->bt_mount,
-				"%s: failed to map %u pages", __func__,
-				bp->b_page_count);
+				"%s: failed to map %u folios", __func__,
+				bp->b_folio_count);
 			xfs_buf_relse(bp);
 			return error;
 		}
@@ -963,14 +964,14 @@  xfs_buf_get_uncached(
 	if (error)
 		return error;
 
-	error = xfs_buf_alloc_pages(bp, flags);
+	error = xfs_buf_alloc_folios(bp, flags);
 	if (error)
 		goto fail_free_buf;
 
-	error = _xfs_buf_map_pages(bp, 0);
+	error = _xfs_buf_map_folios(bp, 0);
 	if (unlikely(error)) {
 		xfs_warn(target->bt_mount,
-			"%s: failed to map pages", __func__);
+			"%s: failed to map folios", __func__);
 		goto fail_free_buf;
 	}
 
@@ -1465,7 +1466,7 @@  xfs_buf_ioapply_map(
 	blk_opf_t	op)
 {
 	int		page_index;
-	unsigned int	total_nr_pages = bp->b_page_count;
+	unsigned int	total_nr_pages = bp->b_folio_count;
 	int		nr_pages;
 	struct bio	*bio;
 	sector_t	sector =  bp->b_maps[map].bm_bn;
@@ -1503,7 +1504,7 @@  xfs_buf_ioapply_map(
 		if (nbytes > size)
 			nbytes = size;
 
-		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
+		rbytes = bio_add_folio(bio, bp->b_folios[page_index], nbytes,
 				      offset);
 		if (rbytes < nbytes)
 			break;
@@ -1716,13 +1717,13 @@  xfs_buf_offset(
 	struct xfs_buf		*bp,
 	size_t			offset)
 {
-	struct page		*page;
+	struct folio		*folio;
 
 	if (bp->b_addr)
 		return bp->b_addr + offset;
 
-	page = bp->b_pages[offset >> PAGE_SHIFT];
-	return page_address(page) + (offset & (PAGE_SIZE-1));
+	folio = bp->b_folios[offset >> PAGE_SHIFT];
+	return folio_address(folio) + (offset & (PAGE_SIZE-1));
 }
 
 void
@@ -1735,18 +1736,18 @@  xfs_buf_zero(
 
 	bend = boff + bsize;
 	while (boff < bend) {
-		struct page	*page;
+		struct folio	*folio;
 		int		page_index, page_offset, csize;
 
 		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
 		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
-		page = bp->b_pages[page_index];
+		folio = bp->b_folios[page_index];
 		csize = min_t(size_t, PAGE_SIZE - page_offset,
 				      BBTOB(bp->b_length) - boff);
 
 		ASSERT((csize + page_offset) <= PAGE_SIZE);
 
-		memset(page_address(page) + page_offset, 0, csize);
+		memset(folio_address(folio) + page_offset, 0, csize);
 
 		boff += csize;
 	}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b470de08a46c..1e7298ff3fa5 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -29,7 +29,7 @@  struct xfs_buf;
 #define XBF_READ_AHEAD	 (1u << 2) /* asynchronous read-ahead */
 #define XBF_NO_IOACCT	 (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
 #define XBF_ASYNC	 (1u << 4) /* initiator will not wait for completion */
-#define XBF_DONE	 (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_DONE	 (1u << 5) /* all folios in the buffer uptodate */
 #define XBF_STALE	 (1u << 6) /* buffer has been staled, do not find it */
 #define XBF_WRITE_FAIL	 (1u << 7) /* async writes have failed on this buffer */
 
@@ -39,7 +39,7 @@  struct xfs_buf;
 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
+#define _XBF_FOLIOS	 (1u << 20)/* backed by refcounted folios */
 #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
 
@@ -68,7 +68,7 @@  typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_INODES,		"INODES" }, \
 	{ _XBF_DQUOTS,		"DQUOTS" }, \
 	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
-	{ _XBF_PAGES,		"PAGES" }, \
+	{ _XBF_FOLIOS,		"FOLIOS" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	/* The following interface flags should never be set */ \
@@ -116,7 +116,7 @@  typedef struct xfs_buftarg {
 	struct ratelimit_state	bt_ioerror_rl;
 } xfs_buftarg_t;
 
-#define XB_PAGES	2
+#define XB_FOLIOS	2
 
 struct xfs_buf_map {
 	xfs_daddr_t		bm_bn;	/* block number for I/O */
@@ -180,14 +180,14 @@  struct xfs_buf {
 	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
 	struct xfs_trans	*b_transp;
-	struct page		**b_pages;	/* array of page pointers */
-	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+	struct folio		**b_folios;	/* array of folio pointers */
+	struct folio		*b_folio_array[XB_FOLIOS]; /* inline folios */
 	struct xfs_buf_map	*b_maps;	/* compound buffer map */
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
 	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
-	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_folio_count;	/* size of folio array */
 	unsigned int		b_offset;	/* page offset of b_addr,
 						   only for _XBF_KMEM buffers */
 	int			b_error;	/* error code on I/O */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 83a81cb52d8e..d1407cee48d9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -69,7 +69,7 @@  xfs_buf_item_straddle(
 {
 	void			*first, *last;
 
-	if (bp->b_page_count == 1 || !(bp->b_flags & XBF_UNMAPPED))
+	if (bp->b_folio_count == 1 || !(bp->b_flags & XBF_UNMAPPED))
 		return false;
 
 	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index caccb7f76690..804389b8e802 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -279,4 +279,12 @@  kmem_to_page(void *addr)
 	return virt_to_page(addr);
 }
 
+static inline struct folio *
+kmem_to_folio(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return page_folio(vmalloc_to_page(addr));
+	return virt_to_folio(addr);
+}
+
 #endif /* __XFS_LINUX__ */