diff mbox series

NFS: add atomic_open for NFSv3 to handle O_TRUNC correctly.

Message ID 171134496555.13576.1334297096866165638@noble.neil.brown.name (mailing list archive)
State New, archived
Headers show
Series NFS: add atomic_open for NFSv3 to handle O_TRUNC correctly. | expand

Commit Message

NeilBrown March 25, 2024, 5:36 a.m. UTC
With two clients, each with NFSv3 mounts of the same directory, the sequence:

   client1            client2
  ls -l afile
                      echo hello there > afile
  echo HELLO > afile
  cat afile

will show
   HELLO
   there

because the O_TRUNC requested in the final 'echo' doesn't take effect.
This is because the "Negative dentry, just create a file" section in
lookup_open() assumes that the file *does* get created since the dentry
was negative, so it sets FMODE_CREATED, and this causes do_open() to
clear O_TRUNC and so the file doesn't get truncated.

Even mounting with -o lookupcache=none does not help as
nfs_neg_need_reval() always returns false if LOOKUP_CREATE is set.

This patch fixes the problem by providing an atomic_open inode operation
for NFSv3 (and v2).  The code is largely the code from the branch in
lookup_open() when atomic_open is not provided.  The significant change
is that the O_TRUNC flag is passed a new nfs_do_create() which add
'trunc' handling to nfs_create().

With this change we also optimise away an unnecessary LOOKUP before the
file is created.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfs/dir.c           | 54 +++++++++++++++++++++++++++++++++++++++---
 fs/nfs/nfs3proc.c      |  1 +
 fs/nfs/proc.c          |  1 +
 include/linux/nfs_fs.h |  3 +++
 4 files changed, 56 insertions(+), 3 deletions(-)

Comments

NeilBrown April 8, 2024, 2:14 a.m. UTC | #1
Hi,
 any thoughts about this patch and/or the problem it addresses?

Thanks,
NeilBrown

On Mon, 25 Mar 2024, NeilBrown wrote:
> With two clients, each with NFSv3 mounts of the same directory, the sequence:
> 
>    client1            client2
>   ls -l afile
>                       echo hello there > afile
>   echo HELLO > afile
>   cat afile
> 
> will show
>    HELLO
>    there
> 
> because the O_TRUNC requested in the final 'echo' doesn't take effect.
> This is because the "Negative dentry, just create a file" section in
> lookup_open() assumes that the file *does* get created since the dentry
> was negative, so it sets FMODE_CREATED, and this causes do_open() to
> clear O_TRUNC and so the file doesn't get truncated.
> 
> Even mounting with -o lookupcache=none does not help as
> nfs_neg_need_reval() always returns false if LOOKUP_CREATE is set.
> 
> This patch fixes the problem by providing an atomic_open inode operation
> for NFSv3 (and v2).  The code is largely the code from the branch in
> lookup_open() when atomic_open is not provided.  The significant change
> is that the O_TRUNC flag is passed a new nfs_do_create() which add
> 'trunc' handling to nfs_create().
> 
> With this change we also optimise away an unnecessary LOOKUP before the
> file is created.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfs/dir.c           | 54 +++++++++++++++++++++++++++++++++++++++---
>  fs/nfs/nfs3proc.c      |  1 +
>  fs/nfs/proc.c          |  1 +
>  include/linux/nfs_fs.h |  3 +++
>  4 files changed, 56 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index ac505671efbd..342930996226 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -56,6 +56,8 @@ static int nfs_readdir(struct file *, struct dir_context *);
>  static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
>  static loff_t nfs_llseek_dir(struct file *, loff_t, int);
>  static void nfs_readdir_clear_array(struct folio *);
> +static int nfs_do_create(struct inode *dir, struct dentry *dentry,
> +			 umode_t mode, int open_flags);
>  
>  const struct file_operations nfs_dir_operations = {
>  	.llseek		= nfs_llseek_dir,
> @@ -2243,6 +2245,41 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
>  
>  #endif /* CONFIG_NFSV4 */
>  
> +int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
> +			struct file *file, unsigned int open_flags,
> +			umode_t mode)
> +{
> +
> +	/* Same as look+open from lookup_open(), but with different O_TRUNC
> +	 * handling.
> +	 */
> +	int error = 0;
> +
> +	if (open_flags & O_CREAT) {
> +		file->f_mode |= FMODE_CREATED;
> +		error = nfs_do_create(dir, dentry, mode, open_flags);
> +		if (error)
> +			return error;
> +		return finish_open(file, dentry, NULL);
> +	} else if (d_in_lookup(dentry)) {
> +		/* The only flags nfs_lookup considers are
> +		 * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
> +		 * we want those to be zero so the lookup isn't skipped.
> +		 */
> +		struct dentry *res = nfs_lookup(dir, dentry, 0);
> +
> +		d_lookup_done(dentry);
> +		if (unlikely(res)) {
> +			if (IS_ERR(res))
> +				return PTR_ERR(res);
> +			return finish_no_open(file, res);
> +		}
> +	}
> +	return finish_no_open(file, NULL);
> +
> +}
> +EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
> +
>  struct dentry *
>  nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
>  				struct nfs_fattr *fattr)
> @@ -2303,18 +2340,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate);
>   * that the operation succeeded on the server, but an error in the
>   * reply path made it appear to have failed.
>   */
> -int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
> -	       struct dentry *dentry, umode_t mode, bool excl)
> +static int nfs_do_create(struct inode *dir, struct dentry *dentry,
> +			 umode_t mode, int open_flags)
>  {
>  	struct iattr attr;
> -	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
>  	int error;
>  
> +	open_flags |= O_CREAT;
> +
>  	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
>  			dir->i_sb->s_id, dir->i_ino, dentry);
>  
>  	attr.ia_mode = mode;
>  	attr.ia_valid = ATTR_MODE;
> +	if (open_flags & O_TRUNC) {
> +		attr.ia_size = 0;
> +		attr.ia_valid |= ATTR_SIZE;
> +	}
>  
>  	trace_nfs_create_enter(dir, dentry, open_flags);
>  	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
> @@ -2326,6 +2368,12 @@ int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
>  	d_drop(dentry);
>  	return error;
>  }
> +
> +int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
> +	       struct dentry *dentry, umode_t mode, bool excl)
> +{
> +	return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0);
> +}
>  EXPORT_SYMBOL_GPL(nfs_create);
>  
>  /*
> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
> index cbbe3f0193b8..74bda639a7cf 100644
> --- a/fs/nfs/nfs3proc.c
> +++ b/fs/nfs/nfs3proc.c
> @@ -986,6 +986,7 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
>  
>  static const struct inode_operations nfs3_dir_inode_operations = {
>  	.create		= nfs_create,
> +	.atomic_open	= nfs_atomic_open_v23,
>  	.lookup		= nfs_lookup,
>  	.link		= nfs_link,
>  	.unlink		= nfs_unlink,
> diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
> index ad3a321ae997..d105e5b2659d 100644
> --- a/fs/nfs/proc.c
> +++ b/fs/nfs/proc.c
> @@ -695,6 +695,7 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags)
>  static const struct inode_operations nfs_dir_inode_operations = {
>  	.create		= nfs_create,
>  	.lookup		= nfs_lookup,
> +	.atomic_open	= nfs_atomic_open_v23,
>  	.link		= nfs_link,
>  	.unlink		= nfs_unlink,
>  	.symlink	= nfs_symlink,
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index d59116ac8209..039898d70954 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -561,6 +561,9 @@ extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openfl
>  extern void nfs_access_zap_cache(struct inode *inode);
>  extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
>  				 u32 *mask, bool may_block);
> +extern int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
> +			       struct file *file, unsigned int open_flags,
> +			       umode_t mode);
>  
>  /*
>   * linux/fs/nfs/symlink.c
> -- 
> 2.44.0
> 
> 
>
James Clark May 28, 2024, 10:52 a.m. UTC | #2
On Mon, 25 Mar 2024, NeilBrown wrote:
> With two clients, each with NFSv3 mounts of the same directory, the sequence:
> 
>    client1            client2
>   ls -l afile
>                       echo hello there > afile
>   echo HELLO > afile
>   cat afile
> 
> will show
>    HELLO
>    there
> 
> because the O_TRUNC requested in the final 'echo' doesn't take effect.
> This is because the "Negative dentry, just create a file" section in
> lookup_open() assumes that the file *does* get created since the dentry
> was negative, so it sets FMODE_CREATED, and this causes do_open() to
> clear O_TRUNC and so the file doesn't get truncated.
> 
> Even mounting with -o lookupcache=none does not help as
> nfs_neg_need_reval() always returns false if LOOKUP_CREATE is set.
> 
> This patch fixes the problem by providing an atomic_open inode operation
> for NFSv3 (and v2).  The code is largely the code from the branch in
> lookup_open() when atomic_open is not provided.  The significant change
> is that the O_TRUNC flag is passed a new nfs_do_create() which add
> 'trunc' handling to nfs_create().
> 
> With this change we also optimise away an unnecessary LOOKUP before the
> file is created.
> 

Hi Neil,

There's an LTP test failure that Aishwarya bisected back to this patch.
Possibly because the new function doesn't check the max filename length
before hitting this warning in encode_filename3():

  WARN_ON_ONCE(length > NFS3_MAXNAMLEN);

I saw some old commit message that mentioned callers should be
checking it, so it seems like a plausible bisect because the test is
testing invalid name lengths. But I didn't look in any more detail than
that.

statvfs01 and inotify02 tests are failing. The full output is at the
end.

Thanks
James

statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) p<4>[ 7735.368939] ------------[ cut here ]------------
assed
<4>[ 7735.376605] WARNING: CPU: 3 PID: 387286 at fs/nfs/nfs3xdr.c:188 encode_filename3+0x44/0x4c
statvfs01.c:44: TPASS: creat(vali<4>[ 7735.385773] Modules linked in: quota_v2 quota_tree dummy veth overlay binfmt_misc btrfs blake2b_generic libcrc32c xor xor_neon raid6_pq zstd_compress fuse drm backlight ip_tables x_tables ipv6 crct10dif_ce onboard_usb_dev smsc [last unloaded: binfmt_misc]
d_fname, 0444) returned fd 3<4>[ 7735.411537] CPU: 3 PID: 387286 Comm: statvfs01 Not tainted 6.9.0-next-20240523 #1

<4>[ 7735.421719] Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Jan 30 2024
<4>[ 7735.432948] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
<4>[ 7735.440191] pc : encode_filename3+0x44/0x4c
<4>[ 7735.444650] lr : nfs3_xdr_enc_create3args+0x44/0xf8
<4>[ 7735.449807] sp : ffff800088703800
<4>[ 7735.453386] x29: ffff800088703800 x28: ffff8000826b6388 x27: ffff800088703c70
<4>[ 7735.460812] x26: ffff00082752f780 x25: 0000000000000001 x24: 0000000000440040
<4>[ 7735.468238] x23: 0000000000000100 x22: ffff000827520010 x21: ffff00082712b600
<4>[ 7735.475663] x20: ffff000827520010 x19: 0000000000000100 x18: 0000000000000000
<4>[ 7735.483088] x17: 0000000000000000 x16: 0000000000000000 x15: 6262626262626262
<4>[ 7735.490512] x14: 6262626262626262 x13: 14269a2303320025 x12: 43956040606e0dbf
<4>[ 7735.497937] x11: ce4b9be95ac6e1de x10: 00000000017009cd x9 : 0332002543956040
<4>[ 7735.505362] x8 : 606e0dbfce4b9be9 x7 : 5ac6e1de00000000 x6 : 017009cd01070001
<4>[ 7735.512786] x5 : ffff00082589b87c x4 : ffff00082410216e x3 : ffff800080445400
<4>[ 7735.520211] x2 : 0000000000000100 x1 : ffff000827520010 x0 : ffff8000887038b8
<4>[ 7735.527636] Call trace:
<4>[ 7735.530345]  encode_filename3+0x44/0x4c
<4>[ 7735.534457]  nfs3_xdr_enc_create3args+0x44/0xf8
<4>[ 7735.539264]  rpcauth_wrap_req_encode+0x1c/0x2c
<4>[ 7735.543986]  rpcauth_wrap_req+0x20/0x2c
<4>[ 7735.548097]  call_encode+0x114/0x294
<4>[ 7735.551947]  __rpc_execute+0xb0/0x3a0
<4>[ 7735.555883]  rpc_execute+0x9c/0xbc
<4>[ 7735.559557]  rpc_run_task+0x128/0x1cc
<4>[ 7735.563494]  rpc_call_sync+0x58/0xb8
<4>[ 7735.567343]  nfs3_rpc_wrapper+0x3c/0x84
<4>[ 7735.571454]  nfs3_proc_create+0xb0/0x2cc
<4>[ 7735.575651]  nfs_atomic_open_v23+0xfc/0x14c
<4>[ 7735.580107]  path_openat+0x64c/0xee0
<4>[ 7735.583957]  do_filp_open+0x80/0x12c
<4>[ 7735.587806]  do_sys_openat2+0xb4/0xe8
<4>[ 7735.591739]  __arm64_sys_openat+0x64/0xac
<4>[ 7735.596021]  invoke_syscall+0x48/0x118
<4>[ 7735.600047]  el0_svc_common.constprop.0+0x40/0xe0
<4>[ 7735.605029]  do_el0_svc+0x1c/0x28
<4>[ 7735.608617]  el0_svc+0x34/0xdc
<4>[ 7735.611944]  el0t_64_sync_handler+0xc0/0xc4
<4>[ 7735.616404]  el0t_64_sync+0x190/0x194
<4>[ 7735.620339] ---[ end trace 0000000000000000 ]---
statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLONG: EIO (5)
<6>[ 7735.689265] EXT4-fs (loop0): unmounting filesystem 99c92af1-0341-4dd6-920c-bc7461170ff2.
tst_test.c:1650: TINFO: === Testing on ext3 ===
tst_test.c:1105: TINFO: Formatting /dev/loop0 with ext3 opts='' extra opts=''
mke2fs 1.46.2 (28-Feb-2021)
tst_test.c:1119: TINFO: Mounting /<6>[ 7737.794577] EXT4-fs (loop0): mounting ext3 file system using the ext4 subsystem
dev/loop0 to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint fstyp=ext3 flags=0
<6>[ 7737.818721] EXT4-fs (loop0): mounted filesystem 91699e00-ff2a-49b3-8159-38ae80bdd87d r/w with ordered data mode. Quota mode: none.
<4>[ 7737.830824] ext3 filesystem being mounted at /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint supports timestamps until 2038-01-19 (0x7fffffff)
statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLON<6>[ 7737.861118] EXT4-fs (loop0): unmounting filesystem 91699e00-ff2a-49b3-8159-38ae80bdd87d.
G: EIO (5)
tst_test.c:1650: TINFO: === Testing on ext4 ===
tst_test.c:1105: TINFO: Formatting /dev/loop0 with ext4 opts='' extra opts=''
mke2fs 1.46.2 (28-Feb-2021)
tst_test.c:1119: TINFO: Mounting /dev/loop0 to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdp<6>[ 7738.791447] EXT4-fs (loop0): mounted filesystem fe0ff94a-6c04-4158-9b0e-069ac82b6c8d r/w with ordered data mode. Quota mode: none.
zv/mntpoint fstyp=ext4 flags=0
<4>[ 7738.805833] ext4 filesystem being mounted at /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint supports timestamps until 2038-01-19 (0x7fffffff)
statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
statvfs01.c:48: TFAIL: creat(toolong_fname, 0<6>[ 7738.837563] EXT4-fs (loop0): unmounting filesystem fe0ff94a-6c04-4158-9b0e-069ac82b6c8d.
444) expected ENAMETOOLONG: EIO (5)
tst_test.c:1650: TINFO: === Testing on tmpfs ===
tst_test.c:1105: TINFO: Skipping mkfs for TMPFS filesystem
tst_test.c:1086: TINFO: Limiting tmpfs size to 32MB
tst_test.c:1119: TINFO: Mounting ltp-tmpfs to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint fstyp=tmpfs flags=0
statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLONG: EIO (5)
Summary:
passed   8
failed   4

inotify02.c:165: TPASS: get event: wd=1 mask=40000004 cookie=0     len=0  name=\"\"
inotify02.c:181: TFAIL: get event: wd=1 mask=00000020 (expected 100) cookie=0     len=16 name=\"test_file1\" (expected \"test_file1\") 0
inotify02.c:181: TFAIL: get event: wd=1 mask=00000100 (expected 20) cookie=0     len=16 name=\"test_file1\" (expected \"test_file1\") 0
inotify02.c:165: TPASS: get event: wd=1 mask=00000008 cookie=0     len=16 name=\"test_file1\"
inotify02.c:165: TPASS: get event: wd=1 mask=00000040 cookie=5537  len=16 name=\"test_file1\"
inotify02.c:165: TPASS: get event: wd=1 mask=00000080 cookie=5537  len=16 name=\"test_file2\"
inotify02.c:165: TPASS: get event: wd=1 mask=00000800 cookie=0     len=0  name=\"\"
inotify02.c:165: TPASS: get event: wd=1 mask=00000200 cookie=0     len=16 name=\"test_file2\"
inotify02.c:165: TPASS: get event: wd=1 mask=00000800 cookie=0     len=0  name=\"\"
NeilBrown May 28, 2024, 9:50 p.m. UTC | #3
On Tue, 28 May 2024, James Clark wrote:
> On Mon, 25 Mar 2024, NeilBrown wrote:
> > With two clients, each with NFSv3 mounts of the same directory, the sequence:
> > 
> >    client1            client2
> >   ls -l afile
> >                       echo hello there > afile
> >   echo HELLO > afile
> >   cat afile
> > 
> > will show
> >    HELLO
> >    there
> > 
> > because the O_TRUNC requested in the final 'echo' doesn't take effect.
> > This is because the "Negative dentry, just create a file" section in
> > lookup_open() assumes that the file *does* get created since the dentry
> > was negative, so it sets FMODE_CREATED, and this causes do_open() to
> > clear O_TRUNC and so the file doesn't get truncated.
> > 
> > Even mounting with -o lookupcache=none does not help as
> > nfs_neg_need_reval() always returns false if LOOKUP_CREATE is set.
> > 
> > This patch fixes the problem by providing an atomic_open inode operation
> > for NFSv3 (and v2).  The code is largely the code from the branch in
> > lookup_open() when atomic_open is not provided.  The significant change
> > is that the O_TRUNC flag is passed a new nfs_do_create() which add
> > 'trunc' handling to nfs_create().
> > 
> > With this change we also optimise away an unnecessary LOOKUP before the
> > file is created.
> > 
> 
> Hi Neil,
> 
> There's an LTP test failure that Aishwarya bisected back to this patch.
> Possibly because the new function doesn't check the max filename length
> before hitting this warning in encode_filename3():
> 
>   WARN_ON_ONCE(length > NFS3_MAXNAMLEN);

Thanks.  I know about that.  I fixed it in our SUSE kernels but I
thought I was being ignored by upstream as I never got any reply at all
so I never got around to resubmitting upstream.  I'll post a patch.

Thanks,
NeilBrown


> 
> I saw some old commit message that mentioned callers should be
> checking it, so it seems like a plausible bisect because the test is
> testing invalid name lengths. But I didn't look in any more detail than
> that.
> 
> statvfs01 and inotify02 tests are failing. The full output is at the
> end.
> 
> Thanks
> James
> 
> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) p<4>[ 7735.368939] ------------[ cut here ]------------
> assed
> <4>[ 7735.376605] WARNING: CPU: 3 PID: 387286 at fs/nfs/nfs3xdr.c:188 encode_filename3+0x44/0x4c
> statvfs01.c:44: TPASS: creat(vali<4>[ 7735.385773] Modules linked in: quota_v2 quota_tree dummy veth overlay binfmt_misc btrfs blake2b_generic libcrc32c xor xor_neon raid6_pq zstd_compress fuse drm backlight ip_tables x_tables ipv6 crct10dif_ce onboard_usb_dev smsc [last unloaded: binfmt_misc]
> d_fname, 0444) returned fd 3<4>[ 7735.411537] CPU: 3 PID: 387286 Comm: statvfs01 Not tainted 6.9.0-next-20240523 #1
> 
> <4>[ 7735.421719] Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Jan 30 2024
> <4>[ 7735.432948] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> <4>[ 7735.440191] pc : encode_filename3+0x44/0x4c
> <4>[ 7735.444650] lr : nfs3_xdr_enc_create3args+0x44/0xf8
> <4>[ 7735.449807] sp : ffff800088703800
> <4>[ 7735.453386] x29: ffff800088703800 x28: ffff8000826b6388 x27: ffff800088703c70
> <4>[ 7735.460812] x26: ffff00082752f780 x25: 0000000000000001 x24: 0000000000440040
> <4>[ 7735.468238] x23: 0000000000000100 x22: ffff000827520010 x21: ffff00082712b600
> <4>[ 7735.475663] x20: ffff000827520010 x19: 0000000000000100 x18: 0000000000000000
> <4>[ 7735.483088] x17: 0000000000000000 x16: 0000000000000000 x15: 6262626262626262
> <4>[ 7735.490512] x14: 6262626262626262 x13: 14269a2303320025 x12: 43956040606e0dbf
> <4>[ 7735.497937] x11: ce4b9be95ac6e1de x10: 00000000017009cd x9 : 0332002543956040
> <4>[ 7735.505362] x8 : 606e0dbfce4b9be9 x7 : 5ac6e1de00000000 x6 : 017009cd01070001
> <4>[ 7735.512786] x5 : ffff00082589b87c x4 : ffff00082410216e x3 : ffff800080445400
> <4>[ 7735.520211] x2 : 0000000000000100 x1 : ffff000827520010 x0 : ffff8000887038b8
> <4>[ 7735.527636] Call trace:
> <4>[ 7735.530345]  encode_filename3+0x44/0x4c
> <4>[ 7735.534457]  nfs3_xdr_enc_create3args+0x44/0xf8
> <4>[ 7735.539264]  rpcauth_wrap_req_encode+0x1c/0x2c
> <4>[ 7735.543986]  rpcauth_wrap_req+0x20/0x2c
> <4>[ 7735.548097]  call_encode+0x114/0x294
> <4>[ 7735.551947]  __rpc_execute+0xb0/0x3a0
> <4>[ 7735.555883]  rpc_execute+0x9c/0xbc
> <4>[ 7735.559557]  rpc_run_task+0x128/0x1cc
> <4>[ 7735.563494]  rpc_call_sync+0x58/0xb8
> <4>[ 7735.567343]  nfs3_rpc_wrapper+0x3c/0x84
> <4>[ 7735.571454]  nfs3_proc_create+0xb0/0x2cc
> <4>[ 7735.575651]  nfs_atomic_open_v23+0xfc/0x14c
> <4>[ 7735.580107]  path_openat+0x64c/0xee0
> <4>[ 7735.583957]  do_filp_open+0x80/0x12c
> <4>[ 7735.587806]  do_sys_openat2+0xb4/0xe8
> <4>[ 7735.591739]  __arm64_sys_openat+0x64/0xac
> <4>[ 7735.596021]  invoke_syscall+0x48/0x118
> <4>[ 7735.600047]  el0_svc_common.constprop.0+0x40/0xe0
> <4>[ 7735.605029]  do_el0_svc+0x1c/0x28
> <4>[ 7735.608617]  el0_svc+0x34/0xdc
> <4>[ 7735.611944]  el0t_64_sync_handler+0xc0/0xc4
> <4>[ 7735.616404]  el0t_64_sync+0x190/0x194
> <4>[ 7735.620339] ---[ end trace 0000000000000000 ]---
> statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLONG: EIO (5)
> <6>[ 7735.689265] EXT4-fs (loop0): unmounting filesystem 99c92af1-0341-4dd6-920c-bc7461170ff2.
> tst_test.c:1650: TINFO: === Testing on ext3 ===
> tst_test.c:1105: TINFO: Formatting /dev/loop0 with ext3 opts='' extra opts=''
> mke2fs 1.46.2 (28-Feb-2021)
> tst_test.c:1119: TINFO: Mounting /<6>[ 7737.794577] EXT4-fs (loop0): mounting ext3 file system using the ext4 subsystem
> dev/loop0 to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint fstyp=ext3 flags=0
> <6>[ 7737.818721] EXT4-fs (loop0): mounted filesystem 91699e00-ff2a-49b3-8159-38ae80bdd87d r/w with ordered data mode. Quota mode: none.
> <4>[ 7737.830824] ext3 filesystem being mounted at /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint supports timestamps until 2038-01-19 (0x7fffffff)
> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
> statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
> statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLON<6>[ 7737.861118] EXT4-fs (loop0): unmounting filesystem 91699e00-ff2a-49b3-8159-38ae80bdd87d.
> G: EIO (5)
> tst_test.c:1650: TINFO: === Testing on ext4 ===
> tst_test.c:1105: TINFO: Formatting /dev/loop0 with ext4 opts='' extra opts=''
> mke2fs 1.46.2 (28-Feb-2021)
> tst_test.c:1119: TINFO: Mounting /dev/loop0 to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdp<6>[ 7738.791447] EXT4-fs (loop0): mounted filesystem fe0ff94a-6c04-4158-9b0e-069ac82b6c8d r/w with ordered data mode. Quota mode: none.
> zv/mntpoint fstyp=ext4 flags=0
> <4>[ 7738.805833] ext4 filesystem being mounted at /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint supports timestamps until 2038-01-19 (0x7fffffff)
> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
> statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
> statvfs01.c:48: TFAIL: creat(toolong_fname, 0<6>[ 7738.837563] EXT4-fs (loop0): unmounting filesystem fe0ff94a-6c04-4158-9b0e-069ac82b6c8d.
> 444) expected ENAMETOOLONG: EIO (5)
> tst_test.c:1650: TINFO: === Testing on tmpfs ===
> tst_test.c:1105: TINFO: Skipping mkfs for TMPFS filesystem
> tst_test.c:1086: TINFO: Limiting tmpfs size to 32MB
> tst_test.c:1119: TINFO: Mounting ltp-tmpfs to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint fstyp=tmpfs flags=0
> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
> statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
> statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLONG: EIO (5)
> Summary:
> passed   8
> failed   4
> 
> inotify02.c:165: TPASS: get event: wd=1 mask=40000004 cookie=0     len=0  name=\"\"
> inotify02.c:181: TFAIL: get event: wd=1 mask=00000020 (expected 100) cookie=0     len=16 name=\"test_file1\" (expected \"test_file1\") 0
> inotify02.c:181: TFAIL: get event: wd=1 mask=00000100 (expected 20) cookie=0     len=16 name=\"test_file1\" (expected \"test_file1\") 0
> inotify02.c:165: TPASS: get event: wd=1 mask=00000008 cookie=0     len=16 name=\"test_file1\"
> inotify02.c:165: TPASS: get event: wd=1 mask=00000040 cookie=5537  len=16 name=\"test_file1\"
> inotify02.c:165: TPASS: get event: wd=1 mask=00000080 cookie=5537  len=16 name=\"test_file2\"
> inotify02.c:165: TPASS: get event: wd=1 mask=00000800 cookie=0     len=0  name=\"\"
> inotify02.c:165: TPASS: get event: wd=1 mask=00000200 cookie=0     len=16 name=\"test_file2\"
> inotify02.c:165: TPASS: get event: wd=1 mask=00000800 cookie=0     len=0  name=\"\"
>
James Clark June 7, 2024, 10:49 a.m. UTC | #4
On 28/05/2024 22:50, NeilBrown wrote:
> On Tue, 28 May 2024, James Clark wrote:
>> On Mon, 25 Mar 2024, NeilBrown wrote:
>>> With two clients, each with NFSv3 mounts of the same directory, the sequence:
>>>
>>>    client1            client2
>>>   ls -l afile
>>>                       echo hello there > afile
>>>   echo HELLO > afile
>>>   cat afile
>>>
>>> will show
>>>    HELLO
>>>    there
>>>
>>> because the O_TRUNC requested in the final 'echo' doesn't take effect.
>>> This is because the "Negative dentry, just create a file" section in
>>> lookup_open() assumes that the file *does* get created since the dentry
>>> was negative, so it sets FMODE_CREATED, and this causes do_open() to
>>> clear O_TRUNC and so the file doesn't get truncated.
>>>
>>> Even mounting with -o lookupcache=none does not help as
>>> nfs_neg_need_reval() always returns false if LOOKUP_CREATE is set.
>>>
>>> This patch fixes the problem by providing an atomic_open inode operation
>>> for NFSv3 (and v2).  The code is largely the code from the branch in
>>> lookup_open() when atomic_open is not provided.  The significant change
>>> is that the O_TRUNC flag is passed a new nfs_do_create() which add
>>> 'trunc' handling to nfs_create().
>>>
>>> With this change we also optimise away an unnecessary LOOKUP before the
>>> file is created.
>>>
>>
>> Hi Neil,
>>
>> There's an LTP test failure that Aishwarya bisected back to this patch.
>> Possibly because the new function doesn't check the max filename length
>> before hitting this warning in encode_filename3():
>>
>>   WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
> 
> Thanks.  I know about that.  I fixed it in our SUSE kernels but I
> thought I was being ignored by upstream as I never got any reply at all
> so I never got around to resubmitting upstream.  I'll post a patch.
> 
> Thanks,
> NeilBrown
> 
> 

Hi Neil,

Now that your fix is in linux-next the statvfs01 test is passing again.
However inotify02 is still failing.

This is because the test expects the IN_CREATE and IN_OPEN events to
come in that order after calling creat(), but now they are reversed. To
me it seems like it could be a test issue and the test should handle
them in either order? Or maybe there should be a single inotify event
with both flags set for the atomic open?

Thanks
James

>>
>> I saw some old commit message that mentioned callers should be
>> checking it, so it seems like a plausible bisect because the test is
>> testing invalid name lengths. But I didn't look in any more detail than
>> that.
>>
>> statvfs01 and inotify02 tests are failing. The full output is at the
>> end.
>>
>> Thanks
>> James
>>
>> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) p<4>[ 7735.368939] ------------[ cut here ]------------
>> assed
>> <4>[ 7735.376605] WARNING: CPU: 3 PID: 387286 at fs/nfs/nfs3xdr.c:188 encode_filename3+0x44/0x4c
>> statvfs01.c:44: TPASS: creat(vali<4>[ 7735.385773] Modules linked in: quota_v2 quota_tree dummy veth overlay binfmt_misc btrfs blake2b_generic libcrc32c xor xor_neon raid6_pq zstd_compress fuse drm backlight ip_tables x_tables ipv6 crct10dif_ce onboard_usb_dev smsc [last unloaded: binfmt_misc]
>> d_fname, 0444) returned fd 3<4>[ 7735.411537] CPU: 3 PID: 387286 Comm: statvfs01 Not tainted 6.9.0-next-20240523 #1
>>
>> <4>[ 7735.421719] Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Jan 30 2024
>> <4>[ 7735.432948] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>> <4>[ 7735.440191] pc : encode_filename3+0x44/0x4c
>> <4>[ 7735.444650] lr : nfs3_xdr_enc_create3args+0x44/0xf8
>> <4>[ 7735.449807] sp : ffff800088703800
>> <4>[ 7735.453386] x29: ffff800088703800 x28: ffff8000826b6388 x27: ffff800088703c70
>> <4>[ 7735.460812] x26: ffff00082752f780 x25: 0000000000000001 x24: 0000000000440040
>> <4>[ 7735.468238] x23: 0000000000000100 x22: ffff000827520010 x21: ffff00082712b600
>> <4>[ 7735.475663] x20: ffff000827520010 x19: 0000000000000100 x18: 0000000000000000
>> <4>[ 7735.483088] x17: 0000000000000000 x16: 0000000000000000 x15: 6262626262626262
>> <4>[ 7735.490512] x14: 6262626262626262 x13: 14269a2303320025 x12: 43956040606e0dbf
>> <4>[ 7735.497937] x11: ce4b9be95ac6e1de x10: 00000000017009cd x9 : 0332002543956040
>> <4>[ 7735.505362] x8 : 606e0dbfce4b9be9 x7 : 5ac6e1de00000000 x6 : 017009cd01070001
>> <4>[ 7735.512786] x5 : ffff00082589b87c x4 : ffff00082410216e x3 : ffff800080445400
>> <4>[ 7735.520211] x2 : 0000000000000100 x1 : ffff000827520010 x0 : ffff8000887038b8
>> <4>[ 7735.527636] Call trace:
>> <4>[ 7735.530345]  encode_filename3+0x44/0x4c
>> <4>[ 7735.534457]  nfs3_xdr_enc_create3args+0x44/0xf8
>> <4>[ 7735.539264]  rpcauth_wrap_req_encode+0x1c/0x2c
>> <4>[ 7735.543986]  rpcauth_wrap_req+0x20/0x2c
>> <4>[ 7735.548097]  call_encode+0x114/0x294
>> <4>[ 7735.551947]  __rpc_execute+0xb0/0x3a0
>> <4>[ 7735.555883]  rpc_execute+0x9c/0xbc
>> <4>[ 7735.559557]  rpc_run_task+0x128/0x1cc
>> <4>[ 7735.563494]  rpc_call_sync+0x58/0xb8
>> <4>[ 7735.567343]  nfs3_rpc_wrapper+0x3c/0x84
>> <4>[ 7735.571454]  nfs3_proc_create+0xb0/0x2cc
>> <4>[ 7735.575651]  nfs_atomic_open_v23+0xfc/0x14c
>> <4>[ 7735.580107]  path_openat+0x64c/0xee0
>> <4>[ 7735.583957]  do_filp_open+0x80/0x12c
>> <4>[ 7735.587806]  do_sys_openat2+0xb4/0xe8
>> <4>[ 7735.591739]  __arm64_sys_openat+0x64/0xac
>> <4>[ 7735.596021]  invoke_syscall+0x48/0x118
>> <4>[ 7735.600047]  el0_svc_common.constprop.0+0x40/0xe0
>> <4>[ 7735.605029]  do_el0_svc+0x1c/0x28
>> <4>[ 7735.608617]  el0_svc+0x34/0xdc
>> <4>[ 7735.611944]  el0t_64_sync_handler+0xc0/0xc4
>> <4>[ 7735.616404]  el0t_64_sync+0x190/0x194
>> <4>[ 7735.620339] ---[ end trace 0000000000000000 ]---
>> statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLONG: EIO (5)
>> <6>[ 7735.689265] EXT4-fs (loop0): unmounting filesystem 99c92af1-0341-4dd6-920c-bc7461170ff2.
>> tst_test.c:1650: TINFO: === Testing on ext3 ===
>> tst_test.c:1105: TINFO: Formatting /dev/loop0 with ext3 opts='' extra opts=''
>> mke2fs 1.46.2 (28-Feb-2021)
>> tst_test.c:1119: TINFO: Mounting /<6>[ 7737.794577] EXT4-fs (loop0): mounting ext3 file system using the ext4 subsystem
>> dev/loop0 to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint fstyp=ext3 flags=0
>> <6>[ 7737.818721] EXT4-fs (loop0): mounted filesystem 91699e00-ff2a-49b3-8159-38ae80bdd87d r/w with ordered data mode. Quota mode: none.
>> <4>[ 7737.830824] ext3 filesystem being mounted at /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint supports timestamps until 2038-01-19 (0x7fffffff)
>> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
>> statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
>> statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLON<6>[ 7737.861118] EXT4-fs (loop0): unmounting filesystem 91699e00-ff2a-49b3-8159-38ae80bdd87d.
>> G: EIO (5)
>> tst_test.c:1650: TINFO: === Testing on ext4 ===
>> tst_test.c:1105: TINFO: Formatting /dev/loop0 with ext4 opts='' extra opts=''
>> mke2fs 1.46.2 (28-Feb-2021)
>> tst_test.c:1119: TINFO: Mounting /dev/loop0 to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdp<6>[ 7738.791447] EXT4-fs (loop0): mounted filesystem fe0ff94a-6c04-4158-9b0e-069ac82b6c8d r/w with ordered data mode. Quota mode: none.
>> zv/mntpoint fstyp=ext4 flags=0
>> <4>[ 7738.805833] ext4 filesystem being mounted at /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint supports timestamps until 2038-01-19 (0x7fffffff)
>> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
>> statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
>> statvfs01.c:48: TFAIL: creat(toolong_fname, 0<6>[ 7738.837563] EXT4-fs (loop0): unmounting filesystem fe0ff94a-6c04-4158-9b0e-069ac82b6c8d.
>> 444) expected ENAMETOOLONG: EIO (5)
>> tst_test.c:1650: TINFO: === Testing on tmpfs ===
>> tst_test.c:1105: TINFO: Skipping mkfs for TMPFS filesystem
>> tst_test.c:1086: TINFO: Limiting tmpfs size to 32MB
>> tst_test.c:1119: TINFO: Mounting ltp-tmpfs to /ltp-tmp/ltp-aTUvKrI1Ui/LTP_stakwdpzv/mntpoint fstyp=tmpfs flags=0
>> statvfs01.c:32: TPASS: statvfs(TEST_PATH, &buf) passed
>> statvfs01.c:44: TPASS: creat(valid_fname, 0444) returned fd 3
>> statvfs01.c:48: TFAIL: creat(toolong_fname, 0444) expected ENAMETOOLONG: EIO (5)
>> Summary:
>> passed   8
>> failed   4
>>
>> inotify02.c:165: TPASS: get event: wd=1 mask=40000004 cookie=0     len=0  name=\"\"
>> inotify02.c:181: TFAIL: get event: wd=1 mask=00000020 (expected 100) cookie=0     len=16 name=\"test_file1\" (expected \"test_file1\") 0
>> inotify02.c:181: TFAIL: get event: wd=1 mask=00000100 (expected 20) cookie=0     len=16 name=\"test_file1\" (expected \"test_file1\") 0
>> inotify02.c:165: TPASS: get event: wd=1 mask=00000008 cookie=0     len=16 name=\"test_file1\"
>> inotify02.c:165: TPASS: get event: wd=1 mask=00000040 cookie=5537  len=16 name=\"test_file1\"
>> inotify02.c:165: TPASS: get event: wd=1 mask=00000080 cookie=5537  len=16 name=\"test_file2\"
>> inotify02.c:165: TPASS: get event: wd=1 mask=00000800 cookie=0     len=0  name=\"\"
>> inotify02.c:165: TPASS: get event: wd=1 mask=00000200 cookie=0     len=16 name=\"test_file2\"
>> inotify02.c:165: TPASS: get event: wd=1 mask=00000800 cookie=0     len=0  name=\"\"
>>
>
NeilBrown June 11, 2024, 2:29 a.m. UTC | #5
On Fri, 07 Jun 2024, James Clark wrote:
> 
> Hi Neil,
> 
> Now that your fix is in linux-next the statvfs01 test is passing again.
> However inotify02 is still failing.
> 
> This is because the test expects the IN_CREATE and IN_OPEN events to
> come in that order after calling creat(), but now they are reversed. To
> me it seems like it could be a test issue and the test should handle
> them in either order? Or maybe there should be a single inotify event
> with both flags set for the atomic open?

Interesting....  I don't see how any filesystem that uses ->atomic_open
would get these in the "right" order - and I do think IN_CREATE should
come before IN_OPEN.

Does NFSv4 pass this test?

IN_OPEN is generated (by fsnotify_open()) when finish_open() is called,
which must be (and is) called by all atomic_open functions.
IN_CREATE is generated (by fsnotify_create()) when open_last_lookups()
detects that FMODE_CREATE was set and that happens *after* lookup_open()
is called, which calls atomic_open().

For filesystems that don't use atomic_open, the IN_OPEN is generated by
the call to do_open() which happens *after* open_last_lookups(), not
inside it.

So the ltp test must already fail for NFSv4, 9p ceph fuse gfs2 ntfs3
overlayfs smb.

Can you confirm is that is, or is not, the case?

Thanks,
NeilBrown
Amir Goldstein June 11, 2024, 6:44 a.m. UTC | #6
On Tue, Jun 11, 2024 at 5:30 AM NeilBrown <neilb@suse.de> wrote:
>
> On Fri, 07 Jun 2024, James Clark wrote:
> >
> > Hi Neil,
> >
> > Now that your fix is in linux-next the statvfs01 test is passing again.
> > However inotify02 is still failing.
> >
> > This is because the test expects the IN_CREATE and IN_OPEN events to
> > come in that order after calling creat(), but now they are reversed. To
> > me it seems like it could be a test issue and the test should handle
> > them in either order? Or maybe there should be a single inotify event
> > with both flags set for the atomic open?
>
> Interesting....  I don't see how any filesystem that uses ->atomic_open
> would get these in the "right" order - and I do think IN_CREATE should
> come before IN_OPEN.

Correct.

>
> Does NFSv4 pass this test?

Probably not.

>
> IN_OPEN is generated (by fsnotify_open()) when finish_open() is called,
> which must be (and is) called by all atomic_open functions.
> IN_CREATE is generated (by fsnotify_create()) when open_last_lookups()
> detects that FMODE_CREATE was set and that happens *after* lookup_open()
> is called, which calls atomic_open().
>
> For filesystems that don't use atomic_open, the IN_OPEN is generated by
> the call to do_open() which happens *after* open_last_lookups(), not
> inside it.

Correct.

>
> So the ltp test must already fail for NFSv4, 9p ceph fuse gfs2 ntfs3
> overlayfs smb.
>

inotify02 does not run on all_filesystems, only on the main test fs,
which is not very often one of the above.

This is how I averted the problem in fanotify16 (which does run on
all_filesystems):

commit 9062824a70b8da74aa5b1db08710d0018b48072e
Author: Amir Goldstein <amir73il@gmail.com>
Date:   Tue Nov 21 12:52:47 2023 +0200

    fanotify16: Fix test failure on FUSE

    Split SAFE_CREAT() into explicit SAFE_MKNOD() and SAFE_OPEN(),
    because with atomic open (e.g. fuse), SAFE_CREAT() generates FAN_OPEN
    before FAN_CREATE (tested with ntfs-3g), which is inconsistent with
    the order of events expected from other filesystems.

inotify02 should be fixed similarly.

I did not find any other inotify test that watches IN_CREATE.
I did not find any other fanotify test that watches both FAN_CREATE
and FAN_OPEN.

Thanks,
Amir.
NeilBrown June 12, 2024, 1:59 a.m. UTC | #7
On Tue, 11 Jun 2024, Amir Goldstein wrote:
> On Tue, Jun 11, 2024 at 5:30 AM NeilBrown <neilb@suse.de> wrote:
> >
> > On Fri, 07 Jun 2024, James Clark wrote:
> > >
> > > Hi Neil,
> > >
> > > Now that your fix is in linux-next the statvfs01 test is passing again.
> > > However inotify02 is still failing.
> > >
> > > This is because the test expects the IN_CREATE and IN_OPEN events to
> > > come in that order after calling creat(), but now they are reversed. To
> > > me it seems like it could be a test issue and the test should handle
> > > them in either order? Or maybe there should be a single inotify event
> > > with both flags set for the atomic open?
> >
> > Interesting....  I don't see how any filesystem that uses ->atomic_open
> > would get these in the "right" order - and I do think IN_CREATE should
> > come before IN_OPEN.
> 
> Correct.
> 
> >
> > Does NFSv4 pass this test?
> 
> Probably not.
> 
> >
> > IN_OPEN is generated (by fsnotify_open()) when finish_open() is called,
> > which must be (and is) called by all atomic_open functions.
> > IN_CREATE is generated (by fsnotify_create()) when open_last_lookups()
> > detects that FMODE_CREATE was set and that happens *after* lookup_open()
> > is called, which calls atomic_open().
> >
> > For filesystems that don't use atomic_open, the IN_OPEN is generated by
> > the call to do_open() which happens *after* open_last_lookups(), not
> > inside it.
> 
> Correct.
> 
> >
> > So the ltp test must already fail for NFSv4, 9p ceph fuse gfs2 ntfs3
> > overlayfs smb.
> >
> 
> inotify02 does not run on all_filesystems, only on the main test fs,
> which is not very often one of the above.
> 
> This is how I averted the problem in fanotify16 (which does run on
> all_filesystems):
> 
> commit 9062824a70b8da74aa5b1db08710d0018b48072e
> Author: Amir Goldstein <amir73il@gmail.com>
> Date:   Tue Nov 21 12:52:47 2023 +0200
> 
>     fanotify16: Fix test failure on FUSE
> 
>     Split SAFE_CREAT() into explicit SAFE_MKNOD() and SAFE_OPEN(),
>     because with atomic open (e.g. fuse), SAFE_CREAT() generates FAN_OPEN
>     before FAN_CREATE (tested with ntfs-3g), which is inconsistent with
>     the order of events expected from other filesystems.
> 
> inotify02 should be fixed similarly.

Alternately - maybe the kernel should be fixed to always get the order
right.
I have a patch.  I'll post it separately.

Thanks for your confirmation that my understanding is correct!

NeilBrown


> 
> I did not find any other inotify test that watches IN_CREATE.
> I did not find any other fanotify test that watches both FAN_CREATE
> and FAN_OPEN.
> 
> Thanks,
> Amir.
>
Petr Vorel June 12, 2024, 7:12 a.m. UTC | #8
> On Tue, 11 Jun 2024, Amir Goldstein wrote:
> > On Tue, Jun 11, 2024 at 5:30 AM NeilBrown <neilb@suse.de> wrote:

> > > On Fri, 07 Jun 2024, James Clark wrote:

> > > > Hi Neil,

> > > > Now that your fix is in linux-next the statvfs01 test is passing again.
> > > > However inotify02 is still failing.

> > > > This is because the test expects the IN_CREATE and IN_OPEN events to
> > > > come in that order after calling creat(), but now they are reversed. To
> > > > me it seems like it could be a test issue and the test should handle
> > > > them in either order? Or maybe there should be a single inotify event
> > > > with both flags set for the atomic open?

> > > Interesting....  I don't see how any filesystem that uses ->atomic_open
> > > would get these in the "right" order - and I do think IN_CREATE should
> > > come before IN_OPEN.

> > Correct.


> > > Does NFSv4 pass this test?

> > Probably not.


> > > IN_OPEN is generated (by fsnotify_open()) when finish_open() is called,
> > > which must be (and is) called by all atomic_open functions.
> > > IN_CREATE is generated (by fsnotify_create()) when open_last_lookups()
> > > detects that FMODE_CREATE was set and that happens *after* lookup_open()
> > > is called, which calls atomic_open().

> > > For filesystems that don't use atomic_open, the IN_OPEN is generated by
> > > the call to do_open() which happens *after* open_last_lookups(), not
> > > inside it.

> > Correct.


> > > So the ltp test must already fail for NFSv4, 9p ceph fuse gfs2 ntfs3
> > > overlayfs smb.


> > inotify02 does not run on all_filesystems, only on the main test fs,
> > which is not very often one of the above.

> > This is how I averted the problem in fanotify16 (which does run on
> > all_filesystems):

> > commit 9062824a70b8da74aa5b1db08710d0018b48072e
> > Author: Amir Goldstein <amir73il@gmail.com>
> > Date:   Tue Nov 21 12:52:47 2023 +0200

> >     fanotify16: Fix test failure on FUSE

> >     Split SAFE_CREAT() into explicit SAFE_MKNOD() and SAFE_OPEN(),
> >     because with atomic open (e.g. fuse), SAFE_CREAT() generates FAN_OPEN
> >     before FAN_CREATE (tested with ntfs-3g), which is inconsistent with
> >     the order of events expected from other filesystems.

> > inotify02 should be fixed similarly.

> Alternately - maybe the kernel should be fixed to always get the order
> right.
> I have a patch.  I'll post it separately.

@Amir, if later Neil's branch get merged, maybe we should use on fanotify16
creat() on the old kernels (as it was in before your fix 9062824a7 ("fanotify16:
Fix test failure on FUSE")), based on kernel check.

Kind regards,
Petr

> Thanks for your confirmation that my understanding is correct!

> NeilBrown



> > I did not find any other inotify test that watches IN_CREATE.
> > I did not find any other fanotify test that watches both FAN_CREATE
> > and FAN_OPEN.

> > Thanks,
> > Amir.
Amir Goldstein June 13, 2024, 10:26 a.m. UTC | #9
On Wed, Jun 12, 2024 at 10:12 AM Petr Vorel <pvorel@suse.cz> wrote:
>
> > On Tue, 11 Jun 2024, Amir Goldstein wrote:
> > > On Tue, Jun 11, 2024 at 5:30 AM NeilBrown <neilb@suse.de> wrote:
>
> > > > On Fri, 07 Jun 2024, James Clark wrote:
>
> > > > > Hi Neil,
>
> > > > > Now that your fix is in linux-next the statvfs01 test is passing again.
> > > > > However inotify02 is still failing.
>
> > > > > This is because the test expects the IN_CREATE and IN_OPEN events to
> > > > > come in that order after calling creat(), but now they are reversed. To
> > > > > me it seems like it could be a test issue and the test should handle
> > > > > them in either order? Or maybe there should be a single inotify event
> > > > > with both flags set for the atomic open?
>
> > > > Interesting....  I don't see how any filesystem that uses ->atomic_open
> > > > would get these in the "right" order - and I do think IN_CREATE should
> > > > come before IN_OPEN.
>
> > > Correct.
>
>
> > > > Does NFSv4 pass this test?
>
> > > Probably not.
>
>
> > > > IN_OPEN is generated (by fsnotify_open()) when finish_open() is called,
> > > > which must be (and is) called by all atomic_open functions.
> > > > IN_CREATE is generated (by fsnotify_create()) when open_last_lookups()
> > > > detects that FMODE_CREATE was set and that happens *after* lookup_open()
> > > > is called, which calls atomic_open().
>
> > > > For filesystems that don't use atomic_open, the IN_OPEN is generated by
> > > > the call to do_open() which happens *after* open_last_lookups(), not
> > > > inside it.
>
> > > Correct.
>
>
> > > > So the ltp test must already fail for NFSv4, 9p ceph fuse gfs2 ntfs3
> > > > overlayfs smb.
>
>
> > > inotify02 does not run on all_filesystems, only on the main test fs,
> > > which is not very often one of the above.
>
> > > This is how I averted the problem in fanotify16 (which does run on
> > > all_filesystems):
>
> > > commit 9062824a70b8da74aa5b1db08710d0018b48072e
> > > Author: Amir Goldstein <amir73il@gmail.com>
> > > Date:   Tue Nov 21 12:52:47 2023 +0200
>
> > >     fanotify16: Fix test failure on FUSE
>
> > >     Split SAFE_CREAT() into explicit SAFE_MKNOD() and SAFE_OPEN(),
> > >     because with atomic open (e.g. fuse), SAFE_CREAT() generates FAN_OPEN
> > >     before FAN_CREATE (tested with ntfs-3g), which is inconsistent with
> > >     the order of events expected from other filesystems.
>
> > > inotify02 should be fixed similarly.
>
> > Alternately - maybe the kernel should be fixed to always get the order
> > right.
> > I have a patch.  I'll post it separately.
>
> @Amir, if later Neil's branch get merged, maybe we should use on fanotify16
> creat() on the old kernels (as it was in before your fix 9062824a7 ("fanotify16:
> Fix test failure on FUSE")), based on kernel check.
>

I am hoping that the fix could be backported to v6.9.y and then we
won't need to worry about older LTS kernels for fanotify16, because
fuse only supports FAN_CREATE since v6.8.

Thanks,
Amir.
Petr Vorel June 14, 2024, 8:18 a.m. UTC | #10
> On Wed, Jun 12, 2024 at 10:12 AM Petr Vorel <pvorel@suse.cz> wrote:

> > > On Tue, 11 Jun 2024, Amir Goldstein wrote:
> > > > On Tue, Jun 11, 2024 at 5:30 AM NeilBrown <neilb@suse.de> wrote:

> > > > > On Fri, 07 Jun 2024, James Clark wrote:

> > > > > > Hi Neil,

> > > > > > Now that your fix is in linux-next the statvfs01 test is passing again.
> > > > > > However inotify02 is still failing.

> > > > > > This is because the test expects the IN_CREATE and IN_OPEN events to
> > > > > > come in that order after calling creat(), but now they are reversed. To
> > > > > > me it seems like it could be a test issue and the test should handle
> > > > > > them in either order? Or maybe there should be a single inotify event
> > > > > > with both flags set for the atomic open?

> > > > > Interesting....  I don't see how any filesystem that uses ->atomic_open
> > > > > would get these in the "right" order - and I do think IN_CREATE should
> > > > > come before IN_OPEN.

> > > > Correct.


> > > > > Does NFSv4 pass this test?

> > > > Probably not.


> > > > > IN_OPEN is generated (by fsnotify_open()) when finish_open() is called,
> > > > > which must be (and is) called by all atomic_open functions.
> > > > > IN_CREATE is generated (by fsnotify_create()) when open_last_lookups()
> > > > > detects that FMODE_CREATE was set and that happens *after* lookup_open()
> > > > > is called, which calls atomic_open().

> > > > > For filesystems that don't use atomic_open, the IN_OPEN is generated by
> > > > > the call to do_open() which happens *after* open_last_lookups(), not
> > > > > inside it.

> > > > Correct.


> > > > > So the ltp test must already fail for NFSv4, 9p ceph fuse gfs2 ntfs3
> > > > > overlayfs smb.


> > > > inotify02 does not run on all_filesystems, only on the main test fs,
> > > > which is not very often one of the above.

> > > > This is how I averted the problem in fanotify16 (which does run on
> > > > all_filesystems):

> > > > commit 9062824a70b8da74aa5b1db08710d0018b48072e
> > > > Author: Amir Goldstein <amir73il@gmail.com>
> > > > Date:   Tue Nov 21 12:52:47 2023 +0200

> > > >     fanotify16: Fix test failure on FUSE

> > > >     Split SAFE_CREAT() into explicit SAFE_MKNOD() and SAFE_OPEN(),
> > > >     because with atomic open (e.g. fuse), SAFE_CREAT() generates FAN_OPEN
> > > >     before FAN_CREATE (tested with ntfs-3g), which is inconsistent with
> > > >     the order of events expected from other filesystems.

> > > > inotify02 should be fixed similarly.

> > > Alternately - maybe the kernel should be fixed to always get the order
> > > right.
> > > I have a patch.  I'll post it separately.

> > @Amir, if later Neil's branch get merged, maybe we should use on fanotify16
> > creat() on the old kernels (as it was in before your fix 9062824a7 ("fanotify16:
> > Fix test failure on FUSE")), based on kernel check.


> I am hoping that the fix could be backported to v6.9.y and then we
> won't need to worry about older LTS kernels for fanotify16, because
> fuse only supports FAN_CREATE since v6.8.

Great! Thanks for info.

Kind regards,
Petr

> Thanks,
> Amir.
diff mbox series

Patch

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac505671efbd..342930996226 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -56,6 +56,8 @@  static int nfs_readdir(struct file *, struct dir_context *);
 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 static void nfs_readdir_clear_array(struct folio *);
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, int open_flags);
 
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
@@ -2243,6 +2245,41 @@  static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 
 #endif /* CONFIG_NFSV4 */
 
+int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
+			struct file *file, unsigned int open_flags,
+			umode_t mode)
+{
+
+	/* Same as look+open from lookup_open(), but with different O_TRUNC
+	 * handling.
+	 */
+	int error = 0;
+
+	if (open_flags & O_CREAT) {
+		file->f_mode |= FMODE_CREATED;
+		error = nfs_do_create(dir, dentry, mode, open_flags);
+		if (error)
+			return error;
+		return finish_open(file, dentry, NULL);
+	} else if (d_in_lookup(dentry)) {
+		/* The only flags nfs_lookup considers are
+		 * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
+		 * we want those to be zero so the lookup isn't skipped.
+		 */
+		struct dentry *res = nfs_lookup(dir, dentry, 0);
+
+		d_lookup_done(dentry);
+		if (unlikely(res)) {
+			if (IS_ERR(res))
+				return PTR_ERR(res);
+			return finish_no_open(file, res);
+		}
+	}
+	return finish_no_open(file, NULL);
+
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
+
 struct dentry *
 nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr)
@@ -2303,18 +2340,23 @@  EXPORT_SYMBOL_GPL(nfs_instantiate);
  * that the operation succeeded on the server, but an error in the
  * reply path made it appear to have failed.
  */
-int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
-	       struct dentry *dentry, umode_t mode, bool excl)
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, int open_flags)
 {
 	struct iattr attr;
-	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
 	int error;
 
+	open_flags |= O_CREAT;
+
 	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
 			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
+	if (open_flags & O_TRUNC) {
+		attr.ia_size = 0;
+		attr.ia_valid |= ATTR_SIZE;
+	}
 
 	trace_nfs_create_enter(dir, dentry, open_flags);
 	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
@@ -2326,6 +2368,12 @@  int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
 	d_drop(dentry);
 	return error;
 }
+
+int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, bool excl)
+{
+	return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0);
+}
 EXPORT_SYMBOL_GPL(nfs_create);
 
 /*
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cbbe3f0193b8..74bda639a7cf 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -986,6 +986,7 @@  static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
 
 static const struct inode_operations nfs3_dir_inode_operations = {
 	.create		= nfs_create,
+	.atomic_open	= nfs_atomic_open_v23,
 	.lookup		= nfs_lookup,
 	.link		= nfs_link,
 	.unlink		= nfs_unlink,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ad3a321ae997..d105e5b2659d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -695,6 +695,7 @@  static int nfs_have_delegation(struct inode *inode, fmode_t flags)
 static const struct inode_operations nfs_dir_inode_operations = {
 	.create		= nfs_create,
 	.lookup		= nfs_lookup,
+	.atomic_open	= nfs_atomic_open_v23,
 	.link		= nfs_link,
 	.unlink		= nfs_unlink,
 	.symlink	= nfs_symlink,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d59116ac8209..039898d70954 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -561,6 +561,9 @@  extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openfl
 extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
 				 u32 *mask, bool may_block);
+extern int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
+			       struct file *file, unsigned int open_flags,
+			       umode_t mode);
 
 /*
  * linux/fs/nfs/symlink.c