
[1/2] filemap: Convert generic_perform_write() to support large folios

Message ID 20240527163616.1135968-2-hch@lst.de (mailing list archive)
State New
Series [1/2] filemap: Convert generic_perform_write() to support large folios

Commit Message

Christoph Hellwig May 27, 2024, 4:36 p.m. UTC
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>

Modelled after the loop in iomap_write_iter(), copy larger chunks from
userspace if the filesystem has created large folios.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
[hch: use mapping_max_folio_size to keep supporting file systems that do
 not support large folios]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)
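
For readers without the iomap code at hand, here is a heavily abridged, paraphrased fragment of the iomap_write_iter() loop this commit mirrors (fs/iomap/buffered-io.c). The begin/copy/end calls are elided, the variable names follow the patch below rather than iomap's, and the exact code differs between kernel versions:

	size_t chunk = mapping_max_folio_size(mapping);	/* PAGE_SIZE if no large folios */

	do {
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */

		bytes = iov_iter_count(i);
retry:
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);

		/* ... write_begin, trim bytes to the folio, copy, write_end ... */

		if (status == 0) {		/* short copy: nothing was accepted */
			if (chunk > PAGE_SIZE)
				chunk /= 2;	/* fall back toward single pages */
			if (copied) {
				bytes = copied;
				goto retry;
			}
		}
	} while (iov_iter_count(i));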

Comments

Matthew Wilcox (Oracle) May 27, 2024, 6:17 p.m. UTC | #1
On Mon, May 27, 2024 at 06:36:08PM +0200, Christoph Hellwig wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> Modelled after the loop in iomap_write_iter(), copy larger chunks from
> userspace if the filesystem has created large folios.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> [hch: use mapping_max_folio_size to keep supporting file systems that do
>  not support large folios]
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Yup, this still makes sense to me.

Could you remind me why we need to call flush_dcache_folio() in
generic_perform_write() while we don't in iomap_write_iter()?

>  		if (mapping_writably_mapped(mapping))
> -			flush_dcache_page(page);
> +			flush_dcache_folio(folio);

(I'm not talking about this one)

> -		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
> -		flush_dcache_page(page);
> +		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> +		flush_dcache_folio(folio);

(this one has no equivalent in iomap)

>  		status = a_ops->write_end(file, mapping, pos, bytes, copied,
>  						page, fsdata);
Christoph Hellwig May 28, 2024, 8:12 a.m. UTC | #2
On Mon, May 27, 2024 at 07:17:18PM +0100, Matthew Wilcox wrote:
> Could you remind me why we need to call flush_dcache_folio() in
> generic_perform_write() while we don't in iomap_write_iter()?

> > -		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
> > -		flush_dcache_page(page);
> > +		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> > +		flush_dcache_folio(folio);
> 
> (this one has no equivalent in iomap)

The iomap equivalents are in __iomap_write_end, iomap_write_end_inline,
and block_write_end.
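
For reference, a paraphrased fragment of __iomap_write_end() (fs/iomap/buffered-io.c; details vary by kernel version), showing that on the iomap side the flush sits in the write_end path rather than next to the copy:

	flush_dcache_folio(folio);

	/*
	 * A short copy into a folio that is not uptodate is treated as a
	 * zero-length write, forcing the caller to redo the whole thing.
	 */
	if (unlikely(copied < len && !folio_test_uptodate(folio)))
		return 0;

	/* ... mark the copied range uptodate and the folio dirty ... */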
Daniel Gomez May 28, 2024, 3:23 p.m. UTC | #3
Hi Christoph, Matthew,
On Mon, May 27, 2024 at 06:36:08PM +0200, Christoph Hellwig wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> Modelled after the loop in iomap_write_iter(), copy larger chunks from
> userspace if the filesystem has created large folios.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> [hch: use mapping_max_folio_size to keep supporting file systems that do
>  not support large folios]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  mm/filemap.c | 40 +++++++++++++++++++++++++---------------
>  1 file changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 382c3d06bfb10c..860728e26ccf32 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3981,21 +3981,24 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  	loff_t pos = iocb->ki_pos;
>  	struct address_space *mapping = file->f_mapping;
>  	const struct address_space_operations *a_ops = mapping->a_ops;
> +	size_t chunk = mapping_max_folio_size(mapping);
>  	long status = 0;
>  	ssize_t written = 0;
>  
>  	do {
>  		struct page *page;
> -		unsigned long offset;	/* Offset into pagecache page */
> -		unsigned long bytes;	/* Bytes to write to page */
> +		struct folio *folio;
> +		size_t offset;		/* Offset into folio */
> +		size_t bytes;		/* Bytes to write to folio */
>  		size_t copied;		/* Bytes copied from user */
>  		void *fsdata = NULL;
>  
> -		offset = (pos & (PAGE_SIZE - 1));
> -		bytes = min_t(unsigned long, PAGE_SIZE - offset,
> -						iov_iter_count(i));
> +		bytes = iov_iter_count(i);
> +retry:
> +		offset = pos & (chunk - 1);
> +		bytes = min(chunk - offset, bytes);
> +		balance_dirty_pages_ratelimited(mapping);
>  
> -again:
>  		/*
>  		 * Bring in the user page that we will copy from _first_.
>  		 * Otherwise there's a nasty deadlock on copying from the
> @@ -4017,11 +4020,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  		if (unlikely(status < 0))
>  			break;
>  
> +		folio = page_folio(page);
> +		offset = offset_in_folio(folio, pos);
> +		if (bytes > folio_size(folio) - offset)
> +			bytes = folio_size(folio) - offset;
> +
>  		if (mapping_writably_mapped(mapping))
> -			flush_dcache_page(page);
> +			flush_dcache_folio(folio);
>  
> -		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
> -		flush_dcache_page(page);
> +		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> +		flush_dcache_folio(folio);
>  
>  		status = a_ops->write_end(file, mapping, pos, bytes, copied,
>  						page, fsdata);

I have the same patch in the shmem large folios tree. It was the last piece
needed to get better performance results. However, folio support is also
needed in the write_begin() and write_end() callbacks. To avoid making them
local to shmem, how should we do the transition to folios in these two
callbacks? I was looking into the aops->read_folio approach, but what do you think?

> @@ -4039,14 +4047,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  			 * halfway through, might be a race with munmap,
>  			 * might be severe memory pressure.
>  			 */
> -			if (copied)
> +			if (chunk > PAGE_SIZE)
> +				chunk /= 2;
> +			if (copied) {
>  				bytes = copied;
> -			goto again;
> +				goto retry;
> +			}
> +		} else {
> +			pos += status;
> +			written += status;
>  		}
> -		pos += status;
> -		written += status;
> -
> -		balance_dirty_pages_ratelimited(mapping);
>  	} while (iov_iter_count(i));
>  
>  	if (!written)
> -- 
> 2.43.0
> 

Daniel
Matthew Wilcox (Oracle) May 28, 2024, 4:50 p.m. UTC | #4
On Tue, May 28, 2024 at 03:23:39PM +0000, Daniel Gomez wrote:
> I have the same patch in the shmem large folios tree. It was the last piece
> needed to get better performance results. However, folio support is also
> needed in the write_begin() and write_end() callbacks.

I don't think it's *needed*.  It's nice!  But clearly not necessary
since Christoph made nfs work without doing that.

> To avoid making them
> local to shmem, how should we do the transition to folios in these two
> callbacks? I was looking into the aops->read_folio approach, but what do you think?

See the v2 of buffer_write_operations that I just posted.  I was waiting
for feedback from Christoph on the revised method for passing fsdata
around, but I may as well just post a v2 and see what happens.
Daniel Gomez May 28, 2024, 7:01 p.m. UTC | #5
On Tue, May 28, 2024 at 05:50:44PM +0100, Matthew Wilcox wrote:
> On Tue, May 28, 2024 at 03:23:39PM +0000, Daniel Gomez wrote:
> > I have the same patch in the shmem large folios tree. It was the last piece
> > needed to get better performance results. However, folio support is also
> > needed in the write_begin() and write_end() callbacks.
> 
> I don't think it's *needed*.  It's nice!  But clearly not necessary
> since Christoph made nfs work without doing that.

I see. We send the length with bytes anyway, and the folio allocated inside
write_begin() is retrieved with folio_page().

I did test this patch (plus the mapping_max_folio_size() patch) for shmem and it works fine for me.
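
As an illustration of why the page-based write_begin()/write_end() prototypes still work with large folios, here is a minimal, hypothetical sketch of a ->write_end that recovers the folio from the page it is handed. The function name and stripped-down body are invented for illustration; real callbacks also handle uptodate state and short copies.

/*
 * Hypothetical example only, not taken from any filesystem: the legacy
 * page-based prototype can still operate on a large folio by converting
 * the page back to its folio.
 */
static int example_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct folio *folio = page_folio(page);	/* recover the folio */

	folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);
	return copied;
}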
> 
> > To avoid making them
> > local to shmem, how should we do the transition to folios in these two
> > callbacks? I was looking into the aops->read_folio approach, but what do you think?
> 
> See the v2 of buffer_write_operations that I just posted.  I was waiting
> for feedback from Christoph on the revised method for passing fsdata
> around, but I may as well just post a v2 and see what happens.

Interesting. I think it makes sense to convert tmpfs to
buffered_write_operations as well. Can you add me to Cc on the v2 so I can
review it and add it for tmpfs?

Thanks
Shaun Tancheff June 11, 2024, 10:47 a.m. UTC | #6
On 5/27/24 23:36, Christoph Hellwig wrote:

> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> Modelled after the loop in iomap_write_iter(), copy larger chunks from
> userspace if the filesystem has created large folios.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> [hch: use mapping_max_folio_size to keep supporting file systems that do
>   not support large folios]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   mm/filemap.c | 40 +++++++++++++++++++++++++---------------
>   1 file changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 382c3d06bfb10c..860728e26ccf32 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3981,21 +3981,24 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>   	loff_t pos = iocb->ki_pos;
>   	struct address_space *mapping = file->f_mapping;
>   	const struct address_space_operations *a_ops = mapping->a_ops;
> +	size_t chunk = mapping_max_folio_size(mapping);

Better to default chunk to PAGE_SIZE for backward compat
+       size_t chunk = PAGE_SIZE;

>   	long status = 0;
>   	ssize_t written = 0;
>   

Have fs opt in to large folio support:

+       if (mapping_large_folio_support(mapping))
+               chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;

>   	do {
>   		struct page *page;
> -		unsigned long offset;	/* Offset into pagecache page */
> -		unsigned long bytes;	/* Bytes to write to page */
> +		struct folio *folio;
> +		size_t offset;		/* Offset into folio */
> +		size_t bytes;		/* Bytes to write to folio */
>   		size_t copied;		/* Bytes copied from user */
>   		void *fsdata = NULL;
>   
> -		offset = (pos & (PAGE_SIZE - 1));
> -		bytes = min_t(unsigned long, PAGE_SIZE - offset,
> -						iov_iter_count(i));
> +		bytes = iov_iter_count(i);
> +retry:
> +		offset = pos & (chunk - 1);
> +		bytes = min(chunk - offset, bytes);
> +		balance_dirty_pages_ratelimited(mapping);
>   
> -again:
>   		/*
>   		 * Bring in the user page that we will copy from _first_.
>   		 * Otherwise there's a nasty deadlock on copying from the
> @@ -4017,11 +4020,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>   		if (unlikely(status < 0))
>   			break;
>   
> +		folio = page_folio(page);
> +		offset = offset_in_folio(folio, pos);
> +		if (bytes > folio_size(folio) - offset)
> +			bytes = folio_size(folio) - offset;
> +
>   		if (mapping_writably_mapped(mapping))
> -			flush_dcache_page(page);
> +			flush_dcache_folio(folio);
>   
> -		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
> -		flush_dcache_page(page);
> +		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> +		flush_dcache_folio(folio);
>   
>   		status = a_ops->write_end(file, mapping, pos, bytes, copied,
>   						page, fsdata);
> @@ -4039,14 +4047,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>   			 * halfway through, might be a race with munmap,
>   			 * might be severe memory pressure.
>   			 */
> -			if (copied)
> +			if (chunk > PAGE_SIZE)
> +				chunk /= 2;
> +			if (copied) {
>   				bytes = copied;
> -			goto again;
> +				goto retry;
> +			}
> +		} else {
> +			pos += status;
> +			written += status;
>   		}
> -		pos += status;
> -		written += status;
> -
> -		balance_dirty_pages_ratelimited(mapping);
>   	} while (iov_iter_count(i));
>   
>   	if (!written)

Tested with Lustre large folios on kernel 6.6, with this patch (and the suggested changes above).

Tested-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Christoph Hellwig June 11, 2024, 4:13 p.m. UTC | #7
On Tue, Jun 11, 2024 at 05:47:12PM +0700, Shaun Tancheff wrote:
>>   	const struct address_space_operations *a_ops = mapping->a_ops;
>> +	size_t chunk = mapping_max_folio_size(mapping);
>
> Better to default chunk to PAGE_SIZE for backward compat
> +       size_t chunk = PAGE_SIZE;
>
>>   	long status = 0;
>>   	ssize_t written = 0;
>>   
>
> Have fs opt in to large folio support:
>
> +       if (mapping_large_folio_support(mapping))
> +               chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;

I don't think you've actually read the code, have you?
Shaun Tancheff June 12, 2024, 1:41 a.m. UTC | #8
On 6/11/24 23:13, Christoph Hellwig wrote:

> On Tue, Jun 11, 2024 at 05:47:12PM +0700, Shaun Tancheff wrote:
>>>    	const struct address_space_operations *a_ops = mapping->a_ops;
>>> +	size_t chunk = mapping_max_folio_size(mapping);
>> Better to default chunk to PAGE_SIZE for backward compat
>> +       size_t chunk = PAGE_SIZE;
>>
>>>    	long status = 0;
>>>    	ssize_t written = 0;
>>>    
>> Have fs opt in to large folio support:
>>
>> +       if (mapping_large_folio_support(mapping))
>> +               chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
> I don't think you've actually read the code, have you?

I checked from 6.6 to linux-next with this patch and my ext4 VM does not boot without the opt-in.

Almost certainly there is something I am missing; I am probably not looking at the correct tree.

Thanks!
--Shaun
Christoph Hellwig June 12, 2024, 4:02 a.m. UTC | #9
On Wed, Jun 12, 2024 at 08:41:01AM +0700, Shaun Tancheff wrote:
> On 6/11/24 23:13, Christoph Hellwig wrote:
>
>> On Tue, Jun 11, 2024 at 05:47:12PM +0700, Shaun Tancheff wrote:
>>>>    	const struct address_space_operations *a_ops = mapping->a_ops;
>>>> +	size_t chunk = mapping_max_folio_size(mapping);
>>> Better to default chunk to PAGE_SIZE for backward compat
>>> +       size_t chunk = PAGE_SIZE;
>>>
>>>>    	long status = 0;
>>>>    	ssize_t written = 0;
>>>>    
>>> Have fs opt in to large folio support:
>>>
>>> +       if (mapping_large_folio_support(mapping))
>>> +               chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
>> I don't think you've actually read the code, have you?
>
> I checked from 6.6 to linux-next with this patch and my ext4 VM does not boot without the opt-in.

Please take a look at the definition of mapping_max_folio_size,
which is called above in the quoted patch.
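
For reference, at the time of this series the helper reads roughly as follows (paraphrased from include/linux/pagemap.h; check the tree for the exact definition). It already falls back to PAGE_SIZE for mappings without large folio support, which is why the extra opt-in suggested above is redundant:

static inline size_t mapping_max_folio_size(struct address_space *mapping)
{
	if (mapping_large_folio_support(mapping))
		return PAGE_SIZE << MAX_PAGECACHE_ORDER;
	return PAGE_SIZE;
}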

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index 382c3d06bfb10c..860728e26ccf32 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3981,21 +3981,24 @@  ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 	loff_t pos = iocb->ki_pos;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
+	size_t chunk = mapping_max_folio_size(mapping);
 	long status = 0;
 	ssize_t written = 0;
 
 	do {
 		struct page *page;
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
+		struct folio *folio;
+		size_t offset;		/* Offset into folio */
+		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata = NULL;
 
-		offset = (pos & (PAGE_SIZE - 1));
-		bytes = min_t(unsigned long, PAGE_SIZE - offset,
-						iov_iter_count(i));
+		bytes = iov_iter_count(i);
+retry:
+		offset = pos & (chunk - 1);
+		bytes = min(chunk - offset, bytes);
+		balance_dirty_pages_ratelimited(mapping);
 
-again:
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@@ -4017,11 +4020,16 @@  ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 		if (unlikely(status < 0))
 			break;
 
+		folio = page_folio(page);
+		offset = offset_in_folio(folio, pos);
+		if (bytes > folio_size(folio) - offset)
+			bytes = folio_size(folio) - offset;
+
 		if (mapping_writably_mapped(mapping))
-			flush_dcache_page(page);
+			flush_dcache_folio(folio);
 
-		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
-		flush_dcache_page(page);
+		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
+		flush_dcache_folio(folio);
 
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
 						page, fsdata);
@@ -4039,14 +4047,16 @@  ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 			 * halfway through, might be a race with munmap,
 			 * might be severe memory pressure.
 			 */
-			if (copied)
+			if (chunk > PAGE_SIZE)
+				chunk /= 2;
+			if (copied) {
 				bytes = copied;
-			goto again;
+				goto retry;
+			}
+		} else {
+			pos += status;
+			written += status;
 		}
-		pos += status;
-		written += status;
-
-		balance_dirty_pages_ratelimited(mapping);
 	} while (iov_iter_count(i));
 
 	if (!written)
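
To round the thread off, a small standalone program (plain userspace C, not kernel code; the folio order, offset, and length are made up) showing what the pos & (chunk - 1) and min(chunk - offset, bytes) arithmetic above does: it carves a write into pieces that never cross a chunk boundary, so each iteration fits within one maximally sized folio.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

static size_t min_size(size_t a, size_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	size_t chunk = PAGE_SIZE << 4;		/* pretend 64 KiB folios */
	unsigned long long pos = 70000;		/* arbitrary file offset */
	size_t remaining = 200000;		/* arbitrary write length */

	while (remaining) {
		size_t offset = pos & (chunk - 1);	/* offset within the chunk */
		size_t bytes = min_size(chunk - offset, remaining);

		printf("pos=%llu offset=%zu bytes=%zu chunk=%zu\n",
		       pos, offset, bytes, chunk);

		/*
		 * The kernel copies from userspace here; on a short copy it
		 * halves chunk and retries.  This demo just consumes bytes.
		 */
		pos += bytes;
		remaining -= bytes;
	}
	return 0;
}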