
nfs: add 'noextend' option for lock-less 'lost writes' prevention

Message ID 20240618153313.3167460-1-dan.aloni@vastdata.com
State New
Series nfs: add 'noextend' option for lock-less 'lost writes' prevention

Commit Message

Dan Aloni June 18, 2024, 3:33 p.m. UTC
There are some applications that write to predefined non-overlapping
file offsets from multiple clients and therefore don't need to rely on
file locking. However, the NFS client's behavior of extending writes to
deal with write fragmentation causes those clients to corrupt each
other's data.

To help these applications, this change adds the `noextend` parameter to
the mount options, and handles this case in `nfs_can_extend_write`.

Clients can additionally add the 'noac' option to ensure the page cache
is flushed on read for modified files.
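
For example, a client participating in such a workload might mount with
(server and path purely illustrative):

    # mount -t nfs -o noextend,noac server:/export /mnt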

Signed-off-by: Dan Aloni <dan.aloni@vastdata.com>
---
 fs/nfs/fs_context.c       | 8 ++++++++
 fs/nfs/super.c            | 3 +++
 fs/nfs/write.c            | 3 +++
 include/linux/nfs_fs_sb.h | 1 +
 4 files changed, 15 insertions(+)

Comments

Trond Myklebust June 18, 2024, 6:59 p.m. UTC | #1
Hi Dan,

On Tue, 2024-06-18 at 18:33 +0300, Dan Aloni wrote:
> There are some applications that write to predefined non-overlapping
> file offsets from multiple clients and therefore don't need to rely
> on
> file locking. However, NFS file system behavior of extending writes
> to
> to deal with write fragmentation, causes those clients to corrupt
> each
> other's data.
> 
> To help these applications, this change adds the `noextend` parameter
> to
> the mount options, and handles this case in `nfs_can_extend_write`.
> 
> Clients can additionally add the 'noac' option to ensure page cache
> flush on read for modified files.

I'm not overly enamoured of the name "noextend". To me that sounds like
it might have something to do with preventing appends. Can we find
something that is a bit more descriptive?

That said, and given your last comment about reads, wouldn't it be
better to have the application use O_DIRECT for these workloads?
Turning off attribute caching is both racy and an inefficient way to
manage page cache consistency. It forces the client to bombard the
server with GETATTR requests in order to check that the page cache is
in synch, whereas your description of the workload appears to suggest
that the correct assumption should be that it is not in synch.

IOW: I'm asking if the better solution might not be to rather implement
something akin to Solaris' "forcedirectio"?
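
Something along these lines, i.e. a per-file opt-out of the page cache
rather than disabling attribute caching mount-wide (the path is purely
illustrative; O_DIRECT needs _GNU_SOURCE with <fcntl.h>):

	fd = open("/mnt/shared.dat", O_RDWR | O_DIRECT);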

> 
> Signed-off-by: Dan Aloni <dan.aloni@vastdata.com>
> ---
>  fs/nfs/fs_context.c       | 8 ++++++++
>  fs/nfs/super.c            | 3 +++
>  fs/nfs/write.c            | 3 +++
>  include/linux/nfs_fs_sb.h | 1 +
>  4 files changed, 15 insertions(+)
> 
> diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
> index 6c9f3f6645dd..509718bc5b24 100644
> --- a/fs/nfs/fs_context.c
> +++ b/fs/nfs/fs_context.c
> @@ -49,6 +49,7 @@ enum nfs_param {
>  	Opt_bsize,
>  	Opt_clientaddr,
>  	Opt_cto,
> +	Opt_extend,
>  	Opt_fg,
>  	Opt_fscache,
>  	Opt_fscache_flag,
> @@ -149,6 +150,7 @@ static const struct fs_parameter_spec
> nfs_fs_parameters[] = {
>  	fsparam_u32   ("bsize",		Opt_bsize),
>  	fsparam_string("clientaddr",	Opt_clientaddr),
>  	fsparam_flag_no("cto",		Opt_cto),
> +	fsparam_flag_no("extend",	Opt_extend),
>  	fsparam_flag  ("fg",		Opt_fg),
>  	fsparam_flag_no("fsc",		Opt_fscache_flag),
>  	fsparam_string("fsc",		Opt_fscache),
> @@ -592,6 +594,12 @@ static int nfs_fs_context_parse_param(struct
> fs_context *fc,
>  		else
>  			ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
>  		break;
> +	case Opt_extend:
> +		if (result.negated)
> +			ctx->flags |= NFS_MOUNT_NO_EXTEND;
> +		else
> +			ctx->flags &= ~NFS_MOUNT_NO_EXTEND;
> +		break;
>  	case Opt_ac:
>  		if (result.negated)
>  			ctx->flags |= NFS_MOUNT_NOAC;
> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
> index cbbd4866b0b7..f27fd3858913 100644
> --- a/fs/nfs/super.c
> +++ b/fs/nfs/super.c
> @@ -549,6 +549,9 @@ static void nfs_show_mount_options(struct
> seq_file *m, struct nfs_server *nfss,
>  	else
>  		seq_puts(m, ",local_lock=posix");
>  
> +	if (nfss->flags & NFS_MOUNT_NO_EXTEND)
> +		seq_puts(m, ",noextend");
> +
>  	if (nfss->flags & NFS_MOUNT_WRITE_EAGER) {
>  		if (nfss->flags & NFS_MOUNT_WRITE_WAIT)
>  			seq_puts(m, ",write=wait");
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 2329cbb0e446..ed76c317b349 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -1315,7 +1315,10 @@ static int nfs_can_extend_write(struct file
> *file, struct folio *folio,
>  	struct file_lock_context *flctx =
> locks_inode_context(inode);
>  	struct file_lock *fl;
>  	int ret;
> +	unsigned int mntflags = NFS_SERVER(inode)->flags;
>  
> +	if (mntflags & NFS_MOUNT_NO_EXTEND)
> +		return 0;
>  	if (file->f_flags & O_DSYNC)
>  		return 0;
>  	if (!nfs_folio_write_uptodate(folio, pagelen))
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 92de074e63b9..f6d8a4f63e50 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -157,6 +157,7 @@ struct nfs_server {
>  #define NFS_MOUNT_WRITE_WAIT		0x02000000
>  #define NFS_MOUNT_TRUNK_DISCOVERY	0x04000000
>  #define NFS_MOUNT_SHUTDOWN			0x08000000
> +#define NFS_MOUNT_NO_EXTEND		0x10000000
>  
>  	unsigned int		fattr_valid;	/* Valid attributes
> */
>  	unsigned int		caps;		/* server
> capabilities */
Christoph Hellwig June 19, 2024, 5:44 a.m. UTC | #2
On Tue, Jun 18, 2024 at 06:33:13PM +0300, Dan Aloni wrote:
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -1315,7 +1315,10 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio,
>  	struct file_lock_context *flctx = locks_inode_context(inode);
>  	struct file_lock *fl;
>  	int ret;
> +	unsigned int mntflags = NFS_SERVER(inode)->flags;
>  
> +	if (mntflags & NFS_MOUNT_NO_EXTEND)
> +		return 0;
>  	if (file->f_flags & O_DSYNC)
>  		return 0;
>  	if (!nfs_folio_write_uptodate(folio, pagelen))

I find the logic in nfs_update_folio to extend the write to the entire
folio rather weird, and especially bad with the larger folio support I
just added.

It makes the client write more (and, with large page sizes or large
folios, potentially a lot more) than what the application asked for.

The comment above nfs_can_extend_write suggests it is done to avoid
"fragmentation".  My immediate reaction assumed that would be about file
system fragmentation, which seems odd given that I'd expect servers to
either log data, in which case this just increases write amplification
for no good reason, or use something like the Linux page cache in which
case it would be entirely pointless.

But when following git blame over a few rounds of fixes (that all narrow
down the scope of this optimization because it caused problems) the
"fragmentation" eventually becomes:

	/* If we're not using byte range locks, and we know the page
	 * is entirely in cache, it may be more efficient to avoid
	 * fragmenting write requests.
	 */

Which to me suggests it is about struct nfs_page and the on-the-wire
RPCs.  In which case the merging in nfs_try_to_update_request that
merges consecutive I/O should take care of all the interesting cases.

In other words:  I strongly suspect everyone is better off if this
extending write behavior is removed or at least not the default.
Sagi Grimberg June 19, 2024, 8:37 a.m. UTC | #3
On 18/06/2024 21:59, Trond Myklebust wrote:
> Hi Dan,
>
> On Tue, 2024-06-18 at 18:33 +0300, Dan Aloni wrote:
>> There are some applications that write to predefined non-overlapping
>> file offsets from multiple clients and therefore don't need to rely
>> on
>> file locking. However, NFS file system behavior of extending writes
>> to
>> to deal with write fragmentation, causes those clients to corrupt
>> each
>> other's data.
>>
>> To help these applications, this change adds the `noextend` parameter
>> to
>> the mount options, and handles this case in `nfs_can_extend_write`.
>>
>> Clients can additionally add the 'noac' option to ensure page cache
>> flush on read for modified files.
> I'm not overly enamoured of the name "noextend". To me that sounds like
> it might have something to do with preventing appends. Can we find
> something that is a bit more descriptive?

nopbw (No page boundary writes) ?

>
> That said, and given your last comment about reads. Wouldn't it be
> better to have the application use O_DIRECT for these workloads?
> Turning off attribute caching is both racy and an inefficient way to
> manage page cache consistency. It forces the client to bombard the
> server with GETATTR requests in order to check that the page cache is
> in synch, whereas your description of the workload appears to suggest
> that the correct assumption should be that it is not in synch.
>
> IOW: I'm asking if the better solution might not be to rather implement
> something akin to Solaris' "forcedirectio"?

This access pattern represents a common case in HPC where different workers
write records, which do not necessarily align to a page boundary, to a
shared output file.

This is not everything that the app is doing, nor the only file it is
accessing, so IMO forcing directio universally may penalize the
application.
Trond Myklebust June 19, 2024, 1:33 p.m. UTC | #4
On Tue, 2024-06-18 at 22:44 -0700, Christoph Hellwig wrote:
> On Tue, Jun 18, 2024 at 06:33:13PM +0300, Dan Aloni wrote:
> > --- a/fs/nfs/write.c
> > +++ b/fs/nfs/write.c
> > @@ -1315,7 +1315,10 @@ static int nfs_can_extend_write(struct file
> > *file, struct folio *folio,
> >  	struct file_lock_context *flctx =
> > locks_inode_context(inode);
> >  	struct file_lock *fl;
> >  	int ret;
> > +	unsigned int mntflags = NFS_SERVER(inode)->flags;
> >  
> > +	if (mntflags & NFS_MOUNT_NO_EXTEND)
> > +		return 0;
> >  	if (file->f_flags & O_DSYNC)
> >  		return 0;
> >  	if (!nfs_folio_write_uptodate(folio, pagelen))
> 
> I find the logic in nfs_update_folio to extend the write to the
> entire
> folio rather weird, and especially bad with the larger folio support
> I
> just added.
> 
> It makes the client write more (and with large page sizes or large
> folios) potentially a lot more than what the application asked for.
> 
> The comment above nfs_can_extend_write suggest it is done to avoid
> "fragmentation".  My immediate reaction assumed that would be about
> file
> system fragmentation, which seems odd given that I'd expect servers
> to
> either log data, in which case this just increases write
> amplification
> for no good reason, or use something like the Linux page cache in
> which
> case it would be entirely pointless.

If you have a workload that does something like a 10 byte write, then
leaves a  hole of 20 bytes, then another 10 byte write, ... then that
workload will produce a train of 10 byte write RPC calls. That ends up
being incredibly slow for obvious reasons: you are forcing the server
to process a load of 10 byte long RPC calls, all of which are
contending for the inode lock for the same file.

If the client knows that the holes are just that, or it knows the data
that was previously written in that area (because the folio is up to
date), then it can consolidate all those 10-byte writes into one 1MB
write. So we end up compressing ~35000 RPC calls into one. Why is that
not a good thing?
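
For concreteness, the pattern looks roughly like this (fd and buf are
purely illustrative):

	/* a 10-byte record every 30 bytes across a 1 MiB region */
	for (off_t off = 0; off < 1024 * 1024; off += 30)
		pwrite(fd, buf, 10, off);

Without extending into the up-to-date folio, each iteration stays a
separate WRITE RPC; with it, the client can coalesce the whole region
into a single large write.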

> 
> But when following git blame over a few rounds of fixes (that all
> narrow
> down the scope of this optimization because it caused problems) the
> "fragmentation" eventually becomes:
> 
> 	/* If we're not using byte range locks, and we know the page
> 	 * is entirely in cache, it may be more efficient to avoid
> 	 * fragmenting write requests.
> 	 */
> 
> Which to me suggests it is about struct nfs_page and the on-the-wire
> RPCs.  In which case the merging in nfs_try_to_update_request that
> merges consecutive I/O should take care of all the interesting cases.
> 
> In other words:  I strongly suspect everyone is better off if this
> extending write behavior is removed or at least not the default.
> 

The merging in nfs_try_to_update_request() is not sufficient to fix
pathologies like the above example, no.
Trond Myklebust June 19, 2024, 2:15 p.m. UTC | #5
On Wed, 2024-06-19 at 11:37 +0300, Sagi Grimberg wrote:
> 
> 
> On 18/06/2024 21:59, Trond Myklebust wrote:
> > Hi Dan,
> > 
> > On Tue, 2024-06-18 at 18:33 +0300, Dan Aloni wrote:
> > > There are some applications that write to predefined non-
> > > overlapping
> > > file offsets from multiple clients and therefore don't need to
> > > rely
> > > on
> > > file locking. However, NFS file system behavior of extending
> > > writes
> > > to
> > > to deal with write fragmentation, causes those clients to corrupt
> > > each
> > > other's data.
> > > 
> > > To help these applications, this change adds the `noextend`
> > > parameter
> > > to
> > > the mount options, and handles this case in
> > > `nfs_can_extend_write`.
> > > 
> > > Clients can additionally add the 'noac' option to ensure page
> > > cache
> > > flush on read for modified files.
> > I'm not overly enamoured of the name "noextend". To me that sounds
> > like
> > it might have something to do with preventing appends. Can we find
> > something that is a bit more descriptive?
> 
> nopbw (No page boundary writes) ?
> 
> > 
> > That said, and given your last comment about reads. Wouldn't it be
> > better to have the application use O_DIRECT for these workloads?
> > Turning off attribute caching is both racy and an inefficient way
> > to
> > manage page cache consistency. It forces the client to bombard the
> > server with GETATTR requests in order to check that the page cache
> > is
> > in synch, whereas your description of the workload appears to
> > suggest
> > that the correct assumption should be that it is not in synch.
> > 
> > IOW: I'm asking if the better solution might not be to rather
> > implement
> > something akin to Solaris' "forcedirectio"?
> 
> This access pattern represents a common case in HPC where different
> workers
> write records to a shared output file which do not necessarily align
> to 
> a page boundary.
> 
> This is not everything that the app is doing nor the only file it is 
> accessing, so IMO forcing
> directio universally is may penalize the application.

Worse than forcing an attribute revalidation on every read?

BTW: We've been asked about the same issue from some of our customers,
and are planning on solving the problem by adding a new per-file
attribute to the NFSv4.2 protocol.

The detection of that NOCACHE attribute would cause the client to
automatically choose O_DIRECT on file open, overriding the default
buffered I/O model. So this would allow the user or sysadmin to specify
at file creation time that this file will be used for purposes that are
incompatible with caching.

If set on a directory, the same attribute would cause the client not to
cache the READDIR contents. This is useful when dealing with
directories where a Windows sysadmin may have set an Access Based
Enumeration property.
Sagi Grimberg June 19, 2024, 2:31 p.m. UTC | #6
On 19/06/2024 17:15, Trond Myklebust wrote:
> On Wed, 2024-06-19 at 11:37 +0300, Sagi Grimberg wrote:
>>
>> On 18/06/2024 21:59, Trond Myklebust wrote:
>>> Hi Dan,
>>>
>>> On Tue, 2024-06-18 at 18:33 +0300, Dan Aloni wrote:
>>>> There are some applications that write to predefined non-
>>>> overlapping
>>>> file offsets from multiple clients and therefore don't need to
>>>> rely
>>>> on
>>>> file locking. However, NFS file system behavior of extending
>>>> writes
>>>> to
>>>> to deal with write fragmentation, causes those clients to corrupt
>>>> each
>>>> other's data.
>>>>
>>>> To help these applications, this change adds the `noextend`
>>>> parameter
>>>> to
>>>> the mount options, and handles this case in
>>>> `nfs_can_extend_write`.
>>>>
>>>> Clients can additionally add the 'noac' option to ensure page
>>>> cache
>>>> flush on read for modified files.
>>> I'm not overly enamoured of the name "noextend". To me that sounds
>>> like
>>> it might have something to do with preventing appends. Can we find
>>> something that is a bit more descriptive?
>> nopbw (No page boundary writes) ?
>>
>>> That said, and given your last comment about reads. Wouldn't it be
>>> better to have the application use O_DIRECT for these workloads?
>>> Turning off attribute caching is both racy and an inefficient way
>>> to
>>> manage page cache consistency. It forces the client to bombard the
>>> server with GETATTR requests in order to check that the page cache
>>> is
>>> in synch, whereas your description of the workload appears to
>>> suggest
>>> that the correct assumption should be that it is not in synch.
>>>
>>> IOW: I'm asking if the better solution might not be to rather
>>> implement
>>> something akin to Solaris' "forcedirectio"?
>> This access pattern represents a common case in HPC where different
>> workers
>> write records to a shared output file which do not necessarily align
>> to
>> a page boundary.
>>
>> This is not everything that the app is doing nor the only file it is
>> accessing, so IMO forcing
>> directio universally is may penalize the application.
> Worse than forcing an attribute revalidation on every read?

For this use-case, yes. Different workloads may or may not be interested
in reading this file.

>
> BTW: We've been asked about the same issue from some of our customers,
> and are planning on solving the problem by adding a new per-file
> attribute to the NFSv4.2 protocol.

Interesting, I recently joined the IETF mailing list but have not seen
discussion of this as of yet. Would be interested to learn more.

>
> The detection of that NOCACHE attribute would cause the client to
> automatically choose O_DIRECT on file open, overriding the default
> buffered I/O model. So this would allow the user or sysadmin to specify
> at file creation time that this file will be used for purposes that are
> incompatible with caching.

user/sysadmin as in not the client? setting this out-of-band?
That does not work where the application and the sysadmin do not know about
each other (e.g. in a cloud environment).

The use-case that is described here cannot be mandated by the server because
the file usage pattern is really driven by the application.
Trond Myklebust June 19, 2024, 6:03 p.m. UTC | #7
On Wed, 2024-06-19 at 09:32 -0400, Trond Myklebust wrote:
> On Tue, 2024-06-18 at 22:44 -0700, Christoph Hellwig wrote:
> > On Tue, Jun 18, 2024 at 06:33:13PM +0300, Dan Aloni wrote:
> > > --- a/fs/nfs/write.c
> > > +++ b/fs/nfs/write.c
> > > @@ -1315,7 +1315,10 @@ static int nfs_can_extend_write(struct
> > > file
> > > *file, struct folio *folio,
> > >  	struct file_lock_context *flctx =
> > > locks_inode_context(inode);
> > >  	struct file_lock *fl;
> > >  	int ret;
> > > +	unsigned int mntflags = NFS_SERVER(inode)->flags;
> > >  
> > > +	if (mntflags & NFS_MOUNT_NO_EXTEND)
> > > +		return 0;
> > >  	if (file->f_flags & O_DSYNC)
> > >  		return 0;
> > >  	if (!nfs_folio_write_uptodate(folio, pagelen))
> > 
> > I find the logic in nfs_update_folio to extend the write to the
> > entire
> > folio rather weird, and especially bad with the larger folio
> > support
> > I
> > just added.
> > 
> > It makes the client write more (and with large page sizes or large
> > folios) potentially a lot more than what the application asked for.
> > 
> > The comment above nfs_can_extend_write suggest it is done to avoid
> > "fragmentation".  My immediate reaction assumed that would be about
> > file
> > system fragmentation, which seems odd given that I'd expect servers
> > to
> > either log data, in which case this just increases write
> > amplification
> > for no good reason, or use something like the Linux page cache in
> > which
> > case it would be entirely pointless.
> 
> If you have a workload that does something like a 10 byte write, then
> leaves a  hole of 20 bytes, then another 10 byte write, ... then that
> workload will produce a train of 10 byte write RPC calls. That ends
> up
> being incredibly slow for obvious reasons: you are forcing the server
> to process a load of 10 byte long RPC calls, all of which are
> contending for the inode lock for the same file.
> 
> If the client knows that the holes are just that, or it knows the
> data
> that was previously written in that area (because the folio is up to
> date) then it can consolidate all those 10 bytes writes into 1MB
> write.
> So we end up compressing ~35000 RPC calls into one. Why is that not a
> good thing?
> 

BTW: this is not just a theoretical thing. Look at the way that glibc
handles a size-extending fallocate() on filesystems that don't have
native support, by writing a byte of information on every 4k boundary.
That's not quite as dramatic as my 10 byte example above, but it still
does reduce the number of required write RPC calls by a factor of 256.
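
Roughly, that fallback does something like the following (simplified
sketch; the real glibc code is careful not to clobber data that is
already present in each block):

	/* emulate fallocate() by touching one byte per 4k block so
	 * that the blocks actually get allocated */
	for (off_t off = offset; off < offset + len; off += 4096)
		pwrite(fd, "", 1, off);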
Christoph Hellwig June 20, 2024, 5:13 a.m. UTC | #8
On Wed, Jun 19, 2024 at 06:03:02PM +0000, Trond Myklebust wrote:
> > So we end up compressing ~35000 RPC calls into one. Why is that not a
> > good thing?
> > 
> 
> BTW: this is not just a theoretical thing. Look at the way that glibc
> handles a size-extending fallocate() on filesystems that don't have
> native support, by writing a byte of information on every 4k boundary.
> That's not quite as dramatic as my 10 byte example above, but it still
> does reduce the number of required write RPC calls by a factor of 256.

That's a bit of a weird case to be honest, especially as it is an
invalid implementation of the fallocate semantics.  At the same time,
this slows down perfectly normal log file workloads that just append
a few bytes in every call, as each of them gets blown up to 4k.

Maybe we'll need a heuristic for servers that don't support ALLOCATE
to work around the broken glibc behavior, but in general blowing up
writes to include potentially huge amounts of data seems like a really
bad default.
Christoph Hellwig June 20, 2024, 5:26 a.m. UTC | #9
On Wed, Jun 19, 2024 at 02:15:16PM +0000, Trond Myklebust wrote:
> BTW: We've been asked about the same issue from some of our customers,
> and are planning on solving the problem by adding a new per-file
> attribute to the NFSv4.2 protocol.
> 
> The detection of that NOCACHE attribute would cause the client to
> automatically choose O_DIRECT on file open, overriding the default
> buffered I/O model. So this would allow the user or sysadmin to specify
> at file creation time that this file will be used for purposes that are
> incompatible with caching.

Can we please come up with coherent semantics for this on the fsdevel
list?  We've had quite a few requests for something similar for local
file systems as well.  The important fine points are things like keeping
the cache coherent if there already is one and only bypassing it
for new data (or alternatively writing it back / invalidating it
beforehand), and the lack of alignment requirements that O_DIRECT
usually has (although that doesn't apply to NFS).
Christoph Hellwig June 20, 2024, 5:27 a.m. UTC | #10
On Wed, Jun 19, 2024 at 05:31:08PM +0300, Sagi Grimberg wrote:
> > buffered I/O model. So this would allow the user or sysadmin to specify
> > at file creation time that this file will be used for purposes that are
> > incompatible with caching.
> 
> user/sysadmin as in not the client? setting this out-of-band?
> That does not work where the application and the sysadmin do not know about
> each other (i.e. in a cloud environment).
> 
> The use-case that is described here cannot be mandated by the server because
> the file usage pattern is really driven by the application.

The way I understood Trond is that it is set on the client, by the
application or an out-of-band tool.  Think of the attributes set by
chattr.
Sagi Grimberg June 20, 2024, 6:41 a.m. UTC | #11
On 20/06/2024 8:27, Christoph Hellwig wrote:
> On Wed, Jun 19, 2024 at 05:31:08PM +0300, Sagi Grimberg wrote:
>>> buffered I/O model. So this would allow the user or sysadmin to specify
>>> at file creation time that this file will be used for purposes that are
>>> incompatible with caching.
>> user/sysadmin as in not the client? setting this out-of-band?
>> That does not work where the application and the sysadmin do not know about
>> each other (i.e. in a cloud environment).
>>
>> The use-case that is described here cannot be mandated by the server because
>> the file usage pattern is really driven by the application.
> The way I understood Trond is that it is set on the client, but the
> application or and out of band tool.  Think of the attributes set by
> chattr.
>
>

Got it. thanks.

Patch

diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 6c9f3f6645dd..509718bc5b24 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -49,6 +49,7 @@  enum nfs_param {
 	Opt_bsize,
 	Opt_clientaddr,
 	Opt_cto,
+	Opt_extend,
 	Opt_fg,
 	Opt_fscache,
 	Opt_fscache_flag,
@@ -149,6 +150,7 @@  static const struct fs_parameter_spec nfs_fs_parameters[] = {
 	fsparam_u32   ("bsize",		Opt_bsize),
 	fsparam_string("clientaddr",	Opt_clientaddr),
 	fsparam_flag_no("cto",		Opt_cto),
+	fsparam_flag_no("extend",	Opt_extend),
 	fsparam_flag  ("fg",		Opt_fg),
 	fsparam_flag_no("fsc",		Opt_fscache_flag),
 	fsparam_string("fsc",		Opt_fscache),
@@ -592,6 +594,12 @@  static int nfs_fs_context_parse_param(struct fs_context *fc,
 		else
 			ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
 		break;
+	case Opt_extend:
+		if (result.negated)
+			ctx->flags |= NFS_MOUNT_NO_EXTEND;
+		else
+			ctx->flags &= ~NFS_MOUNT_NO_EXTEND;
+		break;
 	case Opt_ac:
 		if (result.negated)
 			ctx->flags |= NFS_MOUNT_NOAC;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index cbbd4866b0b7..f27fd3858913 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -549,6 +549,9 @@  static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	else
 		seq_puts(m, ",local_lock=posix");
 
+	if (nfss->flags & NFS_MOUNT_NO_EXTEND)
+		seq_puts(m, ",noextend");
+
 	if (nfss->flags & NFS_MOUNT_WRITE_EAGER) {
 		if (nfss->flags & NFS_MOUNT_WRITE_WAIT)
 			seq_puts(m, ",write=wait");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2329cbb0e446..ed76c317b349 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1315,7 +1315,10 @@  static int nfs_can_extend_write(struct file *file, struct folio *folio,
 	struct file_lock_context *flctx = locks_inode_context(inode);
 	struct file_lock *fl;
 	int ret;
+	unsigned int mntflags = NFS_SERVER(inode)->flags;
 
+	if (mntflags & NFS_MOUNT_NO_EXTEND)
+		return 0;
 	if (file->f_flags & O_DSYNC)
 		return 0;
 	if (!nfs_folio_write_uptodate(folio, pagelen))
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 92de074e63b9..f6d8a4f63e50 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -157,6 +157,7 @@  struct nfs_server {
 #define NFS_MOUNT_WRITE_WAIT		0x02000000
 #define NFS_MOUNT_TRUNK_DISCOVERY	0x04000000
 #define NFS_MOUNT_SHUTDOWN			0x08000000
+#define NFS_MOUNT_NO_EXTEND		0x10000000
 
 	unsigned int		fattr_valid;	/* Valid attributes */
 	unsigned int		caps;		/* server capabilities */