diff mbox series

[v10,2/8] fuse: introduce atomic open

Message ID 20231023183035.11035-3-bschubert@ddn.com (mailing list archive)
State New, archived
Headers show
Series fuse: full atomic open and atomic-open-revalidate | expand

Commit Message

Bernd Schubert Oct. 23, 2023, 6:30 p.m. UTC
From: Dharmendra Singh <dsingh@ddn.com>

This adds full atomic open support, to avoid lookup before open/create.
If the implementation (fuse server/daemon) does not support atomic open
it falls back to non-atomic open.

Co-developed-by: Bernd Schubert <bschubert@ddn.com>
Signed-off-by: Bernd Schubert <bschubert@ddn.com>
Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dharmendra Singh <dsingh@ddn.com>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/fuse/dir.c             | 214 +++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h          |   3 +
 include/uapi/linux/fuse.h |   3 +
 3 files changed, 219 insertions(+), 1 deletion(-)

Comments

Yuan Yao Oct. 24, 2023, 10:12 a.m. UTC | #1
Thank you for addressing the symbolic link problems!
Additionally, I found an incompatibility with the no_open feature.
When the FUSE server has the no_open feature enabled, the atomic_open
FUSE request returns a d_entry with an empty file handler and open
option FOPEN_KEEP_CACHE (for files) or FOPEN_CACHE_DIR (for
directories). With this situation, if we can set fc->no_open = 1 or
fc->no_opendir = 1 after receiving the such FUSE reply, as follows,
will make the atomic_open compatible with no_open feature:
+       if (!inode) {
+               flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
+               fuse_sync_release(NULL, ff, flags);
+               fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
+               err = -ENOMEM;
+               goto out_err;
+       }
+
+ if(ff->fh == 0) {
+        if(ff->open_flags & FOPEN_KEEP_CACHE)
+            fc->no_open = 1;
+        if(ff->open_flags & FOPEN_CACHE_DIR)
+          fc->no_opendir = 1;
+       }
+
+       /* prevent racing/parallel lookup on a negative hashed */


On Tue, Oct 24, 2023 at 3:41 AM Bernd Schubert <bschubert@ddn.com> wrote:
>
> From: Dharmendra Singh <dsingh@ddn.com>
>
> This adds full atomic open support, to avoid lookup before open/create.
> If the implementation (fuse server/daemon) does not support atomic open
> it falls back to non-atomic open.
>
> Co-developed-by: Bernd Schubert <bschubert@ddn.com>
> Signed-off-by: Bernd Schubert <bschubert@ddn.com>
> Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
> Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
> Cc: Miklos Szeredi <miklos@szeredi.hu>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Dharmendra Singh <dsingh@ddn.com>
> Cc: linux-fsdevel@vger.kernel.org
> ---
>  fs/fuse/dir.c             | 214 +++++++++++++++++++++++++++++++++++++-
>  fs/fuse/fuse_i.h          |   3 +
>  include/uapi/linux/fuse.h |   3 +
>  3 files changed, 219 insertions(+), 1 deletion(-)
>
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index e1095852601c..61cdb8e5f68e 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -716,7 +716,7 @@ static int _fuse_create_open(struct inode *dir, struct dentry *entry,
>
>  static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
>                       umode_t, dev_t);
> -static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
> +static int fuse_create_open(struct inode *dir, struct dentry *entry,
>                             struct file *file, unsigned flags,
>                             umode_t mode)
>  {
> @@ -763,6 +763,218 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
>         return finish_no_open(file, res);
>  }
>
> +static int _fuse_atomic_open(struct inode *dir, struct dentry *entry,
> +                            struct file *file, unsigned int flags,
> +                            umode_t mode)
> +{
> +       int err;
> +       struct inode *inode;
> +       FUSE_ARGS(args);
> +       struct fuse_mount *fm = get_fuse_mount(dir);
> +       struct fuse_conn *fc = fm->fc;
> +       struct fuse_forget_link *forget;
> +       struct fuse_create_in inarg;
> +       struct fuse_open_out outopen;
> +       struct fuse_entry_out outentry;
> +       struct fuse_inode *fi;
> +       struct fuse_file *ff;
> +       struct dentry *switched_entry = NULL, *alias = NULL;
> +       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
> +
> +       /* Expect a negative dentry */
> +       if (unlikely(d_inode(entry)))
> +               goto fallback;
> +
> +       /* Userspace expects S_IFREG in create mode */
> +       if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG)
> +               goto fallback;
> +
> +       forget = fuse_alloc_forget();
> +       err = -ENOMEM;
> +       if (!forget)
> +               goto out_err;
> +
> +       err = -ENOMEM;
> +       ff = fuse_file_alloc(fm);
> +       if (!ff)
> +               goto out_put_forget_req;
> +
> +       if (!fc->dont_mask)
> +               mode &= ~current_umask();
> +
> +       flags &= ~O_NOCTTY;
> +       memset(&inarg, 0, sizeof(inarg));
> +       memset(&outentry, 0, sizeof(outentry));
> +       inarg.flags = flags;
> +       inarg.mode = mode;
> +       inarg.umask = current_umask();
> +
> +       if (fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
> +           !(flags & O_EXCL) && !capable(CAP_FSETID)) {
> +               inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
> +       }
> +
> +       args.opcode = FUSE_OPEN_ATOMIC;
> +       args.nodeid = get_node_id(dir);
> +       args.in_numargs = 2;
> +       args.in_args[0].size = sizeof(inarg);
> +       args.in_args[0].value = &inarg;
> +       args.in_args[1].size = entry->d_name.len + 1;
> +       args.in_args[1].value = entry->d_name.name;
> +       args.out_numargs = 2;
> +       args.out_args[0].size = sizeof(outentry);
> +       args.out_args[0].value = &outentry;
> +       args.out_args[1].size = sizeof(outopen);
> +       args.out_args[1].value = &outopen;
> +
> +       if (flags & O_CREAT) {
> +               err = get_create_ext(&args, dir, entry, mode);
> +               if (err)
> +                       goto out_free_ff;
> +       }
> +
> +       err = fuse_simple_request(fm, &args);
> +       free_ext_value(&args);
> +       if (err == -ENOSYS || err == -ELOOP) {
> +               if (unlikely(err == -ENOSYS))
> +                       fc->no_open_atomic = 1;
> +               goto free_and_fallback;
> +       }
> +
> +       if (!err && !outentry.nodeid)
> +               err = -ENOENT;
> +
> +       if (err)
> +               goto out_free_ff;
> +
> +       err = -EIO;
> +       if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr))
> +               goto out_free_ff;
> +
> +       ff->fh = outopen.fh;
> +       ff->nodeid = outentry.nodeid;
> +       ff->open_flags = outopen.open_flags;
> +       inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
> +                         &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
> +       if (!inode) {
> +               flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
> +               fuse_sync_release(NULL, ff, flags);
> +               fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
> +               err = -ENOMEM;
> +               goto out_err;
> +       }
> +
> +       /* prevent racing/parallel lookup on a negative hashed */
> +       if (!(flags & O_CREAT) && !d_in_lookup(entry)) {
> +               d_drop(entry);
> +               switched_entry = d_alloc_parallel(entry->d_parent,
> +                                                  &entry->d_name, &wq);
> +               if (IS_ERR(switched_entry)) {
> +                       err = PTR_ERR(switched_entry);
> +                       switched_entry = NULL;
> +                       goto out_free_ff;
> +               }
> +
> +               if (unlikely(!d_in_lookup(switched_entry))) {
> +                       /* fall back */
> +                       dput(switched_entry);
> +                       switched_entry = NULL;
> +                       goto free_and_fallback;
> +               }
> +
> +               entry = switched_entry;
> +       }
> +
> +       if (d_really_is_negative(entry)) {
> +               d_drop(entry);
> +               alias = d_exact_alias(entry, inode);
> +               if (!alias) {
> +                       alias = d_splice_alias(inode, entry);
> +                       if (IS_ERR(alias)) {
> +                               /*
> +                                * Close the file in user space, but do not unlink it,
> +                                * if it was created - with network file systems other
> +                                * clients might have already accessed it.
> +                                */
> +                               fi = get_fuse_inode(inode);
> +                               fuse_sync_release(fi, ff, flags);
> +                               fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
> +                               err = PTR_ERR(alias);
> +                               goto out_err;
> +                       }
> +               }
> +
> +               if (alias)
> +                       entry = alias;
> +       }
> +
> +       fuse_change_entry_timeout(entry, &outentry);
> +
> +       /*  File was indeed created */
> +       if (outopen.open_flags & FOPEN_FILE_CREATED) {
> +               if (!(flags & O_CREAT)) {
> +                       pr_debug("Server side bug, FOPEN_FILE_CREATED set "
> +                                "without O_CREAT, ignoring.");
> +               } else {
> +                       /* This should be always set when the file is created */
> +                       fuse_dir_changed(dir);
> +                       file->f_mode |= FMODE_CREATED;
> +               }
> +       }
> +
> +       if (S_ISDIR(mode))
> +               ff->open_flags &= ~FOPEN_DIRECT_IO;
> +       err = finish_open(file, entry, generic_file_open);
> +       if (err) {
> +               fi = get_fuse_inode(inode);
> +               fuse_sync_release(fi, ff, flags);
> +       } else {
> +               file->private_data = ff;
> +               fuse_finish_open(inode, file);
> +       }
> +
> +       kfree(forget);
> +
> +       if (switched_entry) {
> +               d_lookup_done(switched_entry);
> +               dput(switched_entry);
> +       }
> +
> +       dput(alias);
> +
> +       return err;
> +
> +out_free_ff:
> +       fuse_file_free(ff);
> +out_put_forget_req:
> +       kfree(forget);
> +out_err:
> +       if (switched_entry) {
> +               d_lookup_done(switched_entry);
> +               dput(switched_entry);
> +       }
> +
> +       return err;
> +
> +free_and_fallback:
> +       fuse_file_free(ff);
> +       kfree(forget);
> +fallback:
> +       return fuse_create_open(dir, entry, file, flags, mode);
> +}
> +
> +static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
> +                           struct file *file, unsigned int flags,
> +                           umode_t mode)
> +{
> +       struct fuse_conn *fc = get_fuse_conn(dir);
> +
> +       if (fc->no_open_atomic)
> +               return fuse_create_open(dir, entry, file, flags, mode);
> +       else
> +               return _fuse_atomic_open(dir, entry, file, flags, mode);
> +}
> +
>  /*
>   * Code shared between mknod, mkdir, symlink and link
>   */
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index bf0b85d0b95c..af69578763ef 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -677,6 +677,9 @@ struct fuse_conn {
>         /** Is open/release not implemented by fs? */
>         unsigned no_open:1;
>
> +       /** Is open atomic not implemented by fs? */
> +       unsigned no_open_atomic:1;
> +
>         /** Is opendir/releasedir not implemented by fs? */
>         unsigned no_opendir:1;
>
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index db92a7202b34..1508afbd9446 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -353,6 +353,7 @@ struct fuse_file_lock {
>   * FOPEN_STREAM: the file is stream-like (no file position at all)
>   * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
>   * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode
> + * FOPEN_FILE_CREATED: the file was indeed created
>   */
>  #define FOPEN_DIRECT_IO                (1 << 0)
>  #define FOPEN_KEEP_CACHE       (1 << 1)
> @@ -361,6 +362,7 @@ struct fuse_file_lock {
>  #define FOPEN_STREAM           (1 << 4)
>  #define FOPEN_NOFLUSH          (1 << 5)
>  #define FOPEN_PARALLEL_DIRECT_WRITES   (1 << 6)
> +#define FOPEN_FILE_CREATED     (1 << 7)
>
>  /**
>   * INIT request/reply flags
> @@ -617,6 +619,7 @@ enum fuse_opcode {
>         FUSE_SYNCFS             = 50,
>         FUSE_TMPFILE            = 51,
>         FUSE_STATX              = 52,
> +       FUSE_OPEN_ATOMIC        = 53,
>
>         /* CUSE specific operations */
>         CUSE_INIT               = 4096,
> --
> 2.39.2
>
>
Bernd Schubert Oct. 24, 2023, 12:36 p.m. UTC | #2
On 10/24/23 12:12, Yuan Yao wrote:
> Thank you for addressing the symbolic link problems!
> Additionally, I found an incompatibility with the no_open feature.
> When the FUSE server has the no_open feature enabled, the atomic_open
> FUSE request returns a d_entry with an empty file handler and open
> option FOPEN_KEEP_CACHE (for files) or FOPEN_CACHE_DIR (for
> directories). With this situation, if we can set fc->no_open = 1 or
> fc->no_opendir = 1 after receiving the such FUSE reply, as follows,
> will make the atomic_open compatible with no_open feature:
> +       if (!inode) {
> +               flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
> +               fuse_sync_release(NULL, ff, flags);
> +               fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
> +               err = -ENOMEM;
> +               goto out_err;
> +       }
> +
> + if(ff->fh == 0) {
> +        if(ff->open_flags & FOPEN_KEEP_CACHE)
> +            fc->no_open = 1;
> +        if(ff->open_flags & FOPEN_CACHE_DIR)
> +          fc->no_opendir = 1;
> +       }
> +
> +       /* prevent racing/parallel lookup on a negative hashed */
> 

Thanks again for your review!

Hmm, are you sure atomic open needs to handle no-open? fuse_file_open 
sets no-open / no-opendir on -ENOSYS. _fuse_atomic_open has a handler 
for -ENOSYS and falls back to the existing create_open. So why does 
atomic open need a no-open handling?


Thanks,
Bernd
Yuan Yao Oct. 26, 2023, 2:42 a.m. UTC | #3
Currently, if _fuse_atomic_open receives an -ENOSYS error, the
no_open_atomic flag is set, preventing further use of atomic_open.
However, even with the no_open feature enabled, atomic_open can still
provide performance benefits when creating new files due to its
ability to combine FUSE lookup and FUSE create operations into a
single atomic request. Therefore, I think it would be advantageous to
allow these two features to coexist.


Thanks,
Yuan

On Tue, Oct 24, 2023 at 9:36 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 10/24/23 12:12, Yuan Yao wrote:
> > Thank you for addressing the symbolic link problems!
> > Additionally, I found an incompatibility with the no_open feature.
> > When the FUSE server has the no_open feature enabled, the atomic_open
> > FUSE request returns a d_entry with an empty file handler and open
> > option FOPEN_KEEP_CACHE (for files) or FOPEN_CACHE_DIR (for
> > directories). With this situation, if we can set fc->no_open = 1 or
> > fc->no_opendir = 1 after receiving the such FUSE reply, as follows,
> > will make the atomic_open compatible with no_open feature:
> > +       if (!inode) {
> > +               flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
> > +               fuse_sync_release(NULL, ff, flags);
> > +               fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
> > +               err = -ENOMEM;
> > +               goto out_err;
> > +       }
> > +
> > + if(ff->fh == 0) {
> > +        if(ff->open_flags & FOPEN_KEEP_CACHE)
> > +            fc->no_open = 1;
> > +        if(ff->open_flags & FOPEN_CACHE_DIR)
> > +          fc->no_opendir = 1;
> > +       }
> > +
> > +       /* prevent racing/parallel lookup on a negative hashed */
> >
>
> Thanks again for your review!
>
> Hmm, are you sure atomic open needs to handle no-open? fuse_file_open
> sets no-open / no-opendir on -ENOSYS. _fuse_atomic_open has a handler
> for -ENOSYS and falls back to the existing create_open. So why does
> atomic open need a no-open handling?
>
>
> Thanks,
> Bernd
>
>
>
>
Al Viro Oct. 28, 2023, 3:03 a.m. UTC | #4
On Mon, Oct 23, 2023 at 08:30:29PM +0200, Bernd Schubert wrote:
> +{
> +	int err;
> +	struct inode *inode;
> +	FUSE_ARGS(args);
> +	struct fuse_mount *fm = get_fuse_mount(dir);
> +	struct fuse_conn *fc = fm->fc;
> +	struct fuse_forget_link *forget;
> +	struct fuse_create_in inarg;
> +	struct fuse_open_out outopen;
> +	struct fuse_entry_out outentry;
> +	struct fuse_inode *fi;
> +	struct fuse_file *ff;
> +	struct dentry *switched_entry = NULL, *alias = NULL;
> +	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
> +
> +	/* Expect a negative dentry */
> +	if (unlikely(d_inode(entry)))
> +		goto fallback;
> +
> +	/* Userspace expects S_IFREG in create mode */
> +	if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG)
> +		goto fallback;

How could it get there with such mode?  We could check that
in fs/namei.c:atomic_open() (with
	if (WARN_ON_ONCE((open_flags & O_CREAT) && !S_ISREG(mode)))
		error = -EINVAL; // for the lack of EWTFAREYOUSMOKING
	else
		error = dir->i_op->atomic_open(....)
or something similar), but that doesn't belong in the method instances.
And it really should never happen - that thing comes from op->mode and
we have build_open_flags() doing this:
        if (WILL_CREATE(flags)) {
                if (how->mode & ~S_IALLUGO)
                        return -EINVAL;
                op->mode = how->mode | S_IFREG;
        } else {
                if (how->mode != 0)
                        return -EINVAL;
                op->mode = 0;
        }
so...  Are other instances doing the same kind of paranoia?  That BUG_ON()
in current fuse_atomic_open() is bogus (and seriously misplaced)...

> +	forget = fuse_alloc_forget();
> +	err = -ENOMEM;
> +	if (!forget)
> +		goto out_err;
> +
> +	err = -ENOMEM;
> +	ff = fuse_file_alloc(fm);
> +	if (!ff)
> +		goto out_put_forget_req;
> +
> +	if (!fc->dont_mask)
> +		mode &= ~current_umask();
> +
> +	flags &= ~O_NOCTTY;
> +	memset(&inarg, 0, sizeof(inarg));
> +	memset(&outentry, 0, sizeof(outentry));
> +	inarg.flags = flags;
> +	inarg.mode = mode;
> +	inarg.umask = current_umask();
> +
> +	if (fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
> +	    !(flags & O_EXCL) && !capable(CAP_FSETID)) {
> +		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
> +	}
> +
> +	args.opcode = FUSE_OPEN_ATOMIC;
> +	args.nodeid = get_node_id(dir);
> +	args.in_numargs = 2;
> +	args.in_args[0].size = sizeof(inarg);
> +	args.in_args[0].value = &inarg;
> +	args.in_args[1].size = entry->d_name.len + 1;
> +	args.in_args[1].value = entry->d_name.name;
> +	args.out_numargs = 2;
> +	args.out_args[0].size = sizeof(outentry);
> +	args.out_args[0].value = &outentry;
> +	args.out_args[1].size = sizeof(outopen);
> +	args.out_args[1].value = &outopen;
> +
> +	if (flags & O_CREAT) {
> +		err = get_create_ext(&args, dir, entry, mode);
> +		if (err)
> +			goto out_free_ff;
> +	}
> +
> +	err = fuse_simple_request(fm, &args);
> +	free_ext_value(&args);
> +	if (err == -ENOSYS || err == -ELOOP) {
> +		if (unlikely(err == -ENOSYS))
> +			fc->no_open_atomic = 1;
> +		goto free_and_fallback;
> +	}
> +
> +	if (!err && !outentry.nodeid)
> +		err = -ENOENT;
> +
> +	if (err)
> +		goto out_free_ff;
> +
> +	err = -EIO;
> +	if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr))
> +		goto out_free_ff;
> +
> +	ff->fh = outopen.fh;
> +	ff->nodeid = outentry.nodeid;
> +	ff->open_flags = outopen.open_flags;
> +	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
> +			  &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
> +	if (!inode) {
> +		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
> +		fuse_sync_release(NULL, ff, flags);
> +		fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
> +		err = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	/* prevent racing/parallel lookup on a negative hashed */
> +	if (!(flags & O_CREAT) && !d_in_lookup(entry)) {

... in which case it has just passed ->d_revalidate()...

> +		d_drop(entry);
> +		switched_entry = d_alloc_parallel(entry->d_parent,
> +						   &entry->d_name, &wq);
> +		if (IS_ERR(switched_entry)) {
> +			err = PTR_ERR(switched_entry);
> +			switched_entry = NULL;
> +			goto out_free_ff;

leaked inode?

> +		}
> +
> +		if (unlikely(!d_in_lookup(switched_entry))) {
> +			/* fall back */
> +			dput(switched_entry);
> +			switched_entry = NULL;
> +			goto free_and_fallback;

ditto, and I don't really understand what the hell is going on with
dentry references here.  What is the intended behaviour in that case?

> +		}
> +
> +		entry = switched_entry;
> +	}
> +
> +	if (d_really_is_negative(entry)) {
> +		d_drop(entry);
> +		alias = d_exact_alias(entry, inode);

What case is that about?  "We have an unhashed positive dentry with that
exact name, parent and inode"?  Where would it have come from?

Another thing: this does not consume an inode reference, no matter what
gets returned,

> +		if (!alias) {
> +			alias = d_splice_alias(inode, entry);

but that one *does* consume the inode reference; note the igrab() in
nfs4 code where you've nicked that from...

> +			if (IS_ERR(alias)) {
> +				/*
> +				 * Close the file in user space, but do not unlink it,
> +				 * if it was created - with network file systems other
> +				 * clients might have already accessed it.
> +				 */
> +				fi = get_fuse_inode(inode);

... so this is asking for UAF.

> +				fuse_sync_release(fi, ff, flags);
> +				fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
> +				err = PTR_ERR(alias);
> +				goto out_err;
> +			}
> +		}
> +
> +		if (alias)
> +			entry = alias;
> +	}

... and here we have no way to tell if inode needs to be dropped.

> +
> +	fuse_change_entry_timeout(entry, &outentry);
> +
> +	/*  File was indeed created */
> +	if (outopen.open_flags & FOPEN_FILE_CREATED) {
> +		if (!(flags & O_CREAT)) {
> +			pr_debug("Server side bug, FOPEN_FILE_CREATED set "
> +				 "without O_CREAT, ignoring.");
> +		} else {
> +			/* This should be always set when the file is created */
> +			fuse_dir_changed(dir);
> +			file->f_mode |= FMODE_CREATED;
> +		}
> +	}
> +
> +	if (S_ISDIR(mode))
> +		ff->open_flags &= ~FOPEN_DIRECT_IO;
> +	err = finish_open(file, entry, generic_file_open);
> +	if (err) {
> +		fi = get_fuse_inode(inode);
> +		fuse_sync_release(fi, ff, flags);
> +	} else {
> +		file->private_data = ff;
> +		fuse_finish_open(inode, file);
> +	}
> +
> +	kfree(forget);
> +
> +	if (switched_entry) {
> +		d_lookup_done(switched_entry);
> +		dput(switched_entry);
> +	}
> +
> +	dput(alias);
> +
> +	return err;
> +
> +out_free_ff:
> +	fuse_file_free(ff);
> +out_put_forget_req:
> +	kfree(forget);
> +out_err:
> +	if (switched_entry) {
> +		d_lookup_done(switched_entry);
> +		dput(switched_entry);
> +	}
> +
> +	return err;
> +
> +free_and_fallback:
> +	fuse_file_free(ff);
> +	kfree(forget);
> +fallback:
> +	return fuse_create_open(dir, entry, file, flags, mode);
> +}
Bernd Schubert Oct. 30, 2023, 3:21 p.m. UTC | #5
Thanks a lot for all your reviews, Al!

On 10/28/23 05:03, Al Viro wrote:
> On Mon, Oct 23, 2023 at 08:30:29PM +0200, Bernd Schubert wrote:
>> +{
>> +	int err;
>> +	struct inode *inode;
>> +	FUSE_ARGS(args);
>> +	struct fuse_mount *fm = get_fuse_mount(dir);
>> +	struct fuse_conn *fc = fm->fc;
>> +	struct fuse_forget_link *forget;
>> +	struct fuse_create_in inarg;
>> +	struct fuse_open_out outopen;
>> +	struct fuse_entry_out outentry;
>> +	struct fuse_inode *fi;
>> +	struct fuse_file *ff;
>> +	struct dentry *switched_entry = NULL, *alias = NULL;
>> +	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
>> +
>> +	/* Expect a negative dentry */
>> +	if (unlikely(d_inode(entry)))
>> +		goto fallback;
>> +
>> +	/* Userspace expects S_IFREG in create mode */
>> +	if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG)
>> +		goto fallback;
> 
> How could it get there with such mode?  We could check that
> in fs/namei.c:atomic_open() (with
> 	if (WARN_ON_ONCE((open_flags & O_CREAT) && !S_ISREG(mode)))
> 		error = -EINVAL; // for the lack of EWTFAREYOUSMOKING
> 	else
> 		error = dir->i_op->atomic_open(....)
> or something similar), but that doesn't belong in the method instances.
> And it really should never happen - that thing comes from op->mode and
> we have build_open_flags() doing this:
>          if (WILL_CREATE(flags)) {
>                  if (how->mode & ~S_IALLUGO)
>                          return -EINVAL;
>                  op->mode = how->mode | S_IFREG;
>          } else {
>                  if (how->mode != 0)
>                          return -EINVAL;
>                  op->mode = 0;
>          }
> so...  Are other instances doing the same kind of paranoia?  That BUG_ON()
> in current fuse_atomic_open() is bogus (and seriously misplaced)...

Ok, sorry, we took over the check blindly. I added another patch in the 
v11 branch to remove the BUG_ON in current fuse_atomic_open.

> 
>> +	forget = fuse_alloc_forget();
>> +	err = -ENOMEM;
>> +	if (!forget)
>> +		goto out_err;
>> +
>> +	err = -ENOMEM;
>> +	ff = fuse_file_alloc(fm);
>> +	if (!ff)
>> +		goto out_put_forget_req;
>> +
>> +	if (!fc->dont_mask)
>> +		mode &= ~current_umask();
>> +
>> +	flags &= ~O_NOCTTY;
>> +	memset(&inarg, 0, sizeof(inarg));
>> +	memset(&outentry, 0, sizeof(outentry));
>> +	inarg.flags = flags;
>> +	inarg.mode = mode;
>> +	inarg.umask = current_umask();
>> +
>> +	if (fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
>> +	    !(flags & O_EXCL) && !capable(CAP_FSETID)) {
>> +		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
>> +	}
>> +
>> +	args.opcode = FUSE_OPEN_ATOMIC;
>> +	args.nodeid = get_node_id(dir);
>> +	args.in_numargs = 2;
>> +	args.in_args[0].size = sizeof(inarg);
>> +	args.in_args[0].value = &inarg;
>> +	args.in_args[1].size = entry->d_name.len + 1;
>> +	args.in_args[1].value = entry->d_name.name;
>> +	args.out_numargs = 2;
>> +	args.out_args[0].size = sizeof(outentry);
>> +	args.out_args[0].value = &outentry;
>> +	args.out_args[1].size = sizeof(outopen);
>> +	args.out_args[1].value = &outopen;
>> +
>> +	if (flags & O_CREAT) {
>> +		err = get_create_ext(&args, dir, entry, mode);
>> +		if (err)
>> +			goto out_free_ff;
>> +	}
>> +
>> +	err = fuse_simple_request(fm, &args);
>> +	free_ext_value(&args);
>> +	if (err == -ENOSYS || err == -ELOOP) {
>> +		if (unlikely(err == -ENOSYS))
>> +			fc->no_open_atomic = 1;
>> +		goto free_and_fallback;
>> +	}
>> +
>> +	if (!err && !outentry.nodeid)
>> +		err = -ENOENT;
>> +
>> +	if (err)
>> +		goto out_free_ff;
>> +
>> +	err = -EIO;
>> +	if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr))
>> +		goto out_free_ff;
>> +
>> +	ff->fh = outopen.fh;
>> +	ff->nodeid = outentry.nodeid;
>> +	ff->open_flags = outopen.open_flags;
>> +	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
>> +			  &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
>> +	if (!inode) {
>> +		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
>> +		fuse_sync_release(NULL, ff, flags);
>> +		fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
>> +		err = -ENOMEM;
>> +		goto out_err;
>> +	}
>> +
>> +	/* prevent racing/parallel lookup on a negative hashed */
>> +	if (!(flags & O_CREAT) && !d_in_lookup(entry)) {
> 
> ... in which case it has just passed ->d_revalidate()...

With the follow up patches that revalidate is going to be moved to this 
function.
Is there anything here that needs to be improved? I had added that check 
after looking through the other atomic_open methods and then noticed 
your commit c94c09535c4d:
nfs_atomic_open(): prevent parallel nfs_lookup() on a negative hashed


> 
>> +		d_drop(entry);
>> +		switched_entry = d_alloc_parallel(entry->d_parent,
>> +						   &entry->d_name, &wq);
>> +		if (IS_ERR(switched_entry)) {
>> +			err = PTR_ERR(switched_entry);
>> +			switched_entry = NULL;
>> +			goto out_free_ff;
> 
> leaked inode?

Yikes, silly me and with that also leaked fuse userspace inode.

> 
>> +		}
>> +
>> +		if (unlikely(!d_in_lookup(switched_entry))) {
>> +			/* fall back */
>> +			dput(switched_entry);
>> +			switched_entry = NULL;
>> +			goto free_and_fallback;
> 
> ditto, and I don't really understand what the hell is going on with
> dentry references here.  What is the intended behaviour in that case?


The idea was to give up 'switched_entry' and let the existing 
fuse_create_open do the fallback work. Hmm, yeah, already called 
d_drop(dentry). And it also already did the userspace part - fallback 
without fuse-forget is totally wrong.
I guess I need severe patch update because of this - the other parts are 
not difficult, but here it gets complex.

> 
>> +		}
>> +
>> +		entry = switched_entry;
>> +	}
>> +
>> +	if (d_really_is_negative(entry)) {
>> +		d_drop(entry);
>> +		alias = d_exact_alias(entry, inode);
> 
> What case is that about?  "We have an unhashed positive dentry with that
> exact name, parent and inode"?  Where would it have come from?

Sorry, taken from _nfs4_open_and_get_state and I wasn't sure if it is 
needed. A bit lame excuse, but NFS is the only other file system I found 
that handles !O_CREAT. I definitely should have marked it with something 
like /* XXX is this really needed, from _nfs4_open_and_get_state */

> 
> Another thing: this does not consume an inode reference, no matter what
> gets returned,
> 
>> +		if (!alias) {
>> +			alias = d_splice_alias(inode, entry);
> 
> but that one *does* consume the inode reference; note the igrab() in
> nfs4 code where you've nicked that from...
> 
>> +			if (IS_ERR(alias)) {
>> +				/*
>> +				 * Close the file in user space, but do not unlink it,
>> +				 * if it was created - with network file systems other
>> +				 * clients might have already accessed it.
>> +				 */
>> +				fi = get_fuse_inode(inode);
> 
> ... so this is asking for UAF.

Yeah, and staring at it again, the below fuse_queue_forget is not right 
either, as that is already handled through the inode reference / 
->evict_inode.

> 
>> +				fuse_sync_release(fi, ff, flags);
>> +				fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
>> +				err = PTR_ERR(alias);
>> +				goto out_err;
>> +			}
>> +		}
>> +
>> +		if (alias)
>> +			entry = alias;
>> +	}
> 
> ... and here we have no way to tell if inode needs to be dropped.

I guess I could solve this with another variable, but maybe there is a 
more beautiful way. I first need to think about the 
!d_in_lookup(switched_entry).


Thanks again so much for your reviews,
Bernd
diff mbox series

Patch

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e1095852601c..61cdb8e5f68e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -716,7 +716,7 @@  static int _fuse_create_open(struct inode *dir, struct dentry *entry,
 
 static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
 		      umode_t, dev_t);
-static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
+static int fuse_create_open(struct inode *dir, struct dentry *entry,
 			    struct file *file, unsigned flags,
 			    umode_t mode)
 {
@@ -763,6 +763,218 @@  static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 	return finish_no_open(file, res);
 }
 
+static int _fuse_atomic_open(struct inode *dir, struct dentry *entry,
+			     struct file *file, unsigned int flags,
+			     umode_t mode)
+{
+	int err;
+	struct inode *inode;
+	FUSE_ARGS(args);
+	struct fuse_mount *fm = get_fuse_mount(dir);
+	struct fuse_conn *fc = fm->fc;
+	struct fuse_forget_link *forget;
+	struct fuse_create_in inarg;
+	struct fuse_open_out outopen;
+	struct fuse_entry_out outentry;
+	struct fuse_inode *fi;
+	struct fuse_file *ff;
+	struct dentry *switched_entry = NULL, *alias = NULL;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+	/* Expect a negative dentry */
+	if (unlikely(d_inode(entry)))
+		goto fallback;
+
+	/* Userspace expects S_IFREG in create mode */
+	if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG)
+		goto fallback;
+
+	forget = fuse_alloc_forget();
+	err = -ENOMEM;
+	if (!forget)
+		goto out_err;
+
+	err = -ENOMEM;
+	ff = fuse_file_alloc(fm);
+	if (!ff)
+		goto out_put_forget_req;
+
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
+	flags &= ~O_NOCTTY;
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outentry, 0, sizeof(outentry));
+	inarg.flags = flags;
+	inarg.mode = mode;
+	inarg.umask = current_umask();
+
+	if (fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
+	    !(flags & O_EXCL) && !capable(CAP_FSETID)) {
+		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+	}
+
+	args.opcode = FUSE_OPEN_ATOMIC;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
+	args.out_numargs = 2;
+	args.out_args[0].size = sizeof(outentry);
+	args.out_args[0].value = &outentry;
+	args.out_args[1].size = sizeof(outopen);
+	args.out_args[1].value = &outopen;
+
+	if (flags & O_CREAT) {
+		err = get_create_ext(&args, dir, entry, mode);
+		if (err)
+			goto out_free_ff;
+	}
+
+	err = fuse_simple_request(fm, &args);
+	free_ext_value(&args);
+	if (err == -ENOSYS || err == -ELOOP) {
+		if (unlikely(err == -ENOSYS))
+			fc->no_open_atomic = 1;
+		goto free_and_fallback;
+	}
+
+	if (!err && !outentry.nodeid)
+		err = -ENOENT;
+
+	if (err)
+		goto out_free_ff;
+
+	err = -EIO;
+	if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr))
+		goto out_free_ff;
+
+	ff->fh = outopen.fh;
+	ff->nodeid = outentry.nodeid;
+	ff->open_flags = outopen.open_flags;
+	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
+			  &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
+	if (!inode) {
+		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
+		fuse_sync_release(NULL, ff, flags);
+		fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	/* prevent racing/parallel lookup on a negative hashed */
+	if (!(flags & O_CREAT) && !d_in_lookup(entry)) {
+		d_drop(entry);
+		switched_entry = d_alloc_parallel(entry->d_parent,
+						   &entry->d_name, &wq);
+		if (IS_ERR(switched_entry)) {
+			err = PTR_ERR(switched_entry);
+			switched_entry = NULL;
+			goto out_free_ff;
+		}
+
+		if (unlikely(!d_in_lookup(switched_entry))) {
+			/* fall back */
+			dput(switched_entry);
+			switched_entry = NULL;
+			goto free_and_fallback;
+		}
+
+		entry = switched_entry;
+	}
+
+	if (d_really_is_negative(entry)) {
+		d_drop(entry);
+		alias = d_exact_alias(entry, inode);
+		if (!alias) {
+			alias = d_splice_alias(inode, entry);
+			if (IS_ERR(alias)) {
+				/*
+				 * Close the file in user space, but do not unlink it,
+				 * if it was created - with network file systems other
+				 * clients might have already accessed it.
+				 */
+				fi = get_fuse_inode(inode);
+				fuse_sync_release(fi, ff, flags);
+				fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
+				err = PTR_ERR(alias);
+				goto out_err;
+			}
+		}
+
+		if (alias)
+			entry = alias;
+	}
+
+	fuse_change_entry_timeout(entry, &outentry);
+
+	/*  File was indeed created */
+	if (outopen.open_flags & FOPEN_FILE_CREATED) {
+		if (!(flags & O_CREAT)) {
+			pr_debug("Server side bug, FOPEN_FILE_CREATED set "
+				 "without O_CREAT, ignoring.");
+		} else {
+			/* This should be always set when the file is created */
+			fuse_dir_changed(dir);
+			file->f_mode |= FMODE_CREATED;
+		}
+	}
+
+	if (S_ISDIR(mode))
+		ff->open_flags &= ~FOPEN_DIRECT_IO;
+	err = finish_open(file, entry, generic_file_open);
+	if (err) {
+		fi = get_fuse_inode(inode);
+		fuse_sync_release(fi, ff, flags);
+	} else {
+		file->private_data = ff;
+		fuse_finish_open(inode, file);
+	}
+
+	kfree(forget);
+
+	if (switched_entry) {
+		d_lookup_done(switched_entry);
+		dput(switched_entry);
+	}
+
+	dput(alias);
+
+	return err;
+
+out_free_ff:
+	fuse_file_free(ff);
+out_put_forget_req:
+	kfree(forget);
+out_err:
+	if (switched_entry) {
+		d_lookup_done(switched_entry);
+		dput(switched_entry);
+	}
+
+	return err;
+
+free_and_fallback:
+	fuse_file_free(ff);
+	kfree(forget);
+fallback:
+	return fuse_create_open(dir, entry, file, flags, mode);
+}
+
+static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
+			    struct file *file, unsigned int flags,
+			    umode_t mode)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+
+	if (fc->no_open_atomic)
+		return fuse_create_open(dir, entry, file, flags, mode);
+	else
+		return _fuse_atomic_open(dir, entry, file, flags, mode);
+}
+
 /*
  * Code shared between mknod, mkdir, symlink and link
  */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..af69578763ef 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -677,6 +677,9 @@  struct fuse_conn {
 	/** Is open/release not implemented by fs? */
 	unsigned no_open:1;
 
+	/** Is open atomic not implemented by fs? */
+	unsigned no_open_atomic:1;
+
 	/** Is opendir/releasedir not implemented by fs? */
 	unsigned no_opendir:1;
 
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index db92a7202b34..1508afbd9446 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -353,6 +353,7 @@  struct fuse_file_lock {
  * FOPEN_STREAM: the file is stream-like (no file position at all)
  * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
  * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode
+ * FOPEN_FILE_CREATED: the file was indeed created
  */
 #define FOPEN_DIRECT_IO		(1 << 0)
 #define FOPEN_KEEP_CACHE	(1 << 1)
@@ -361,6 +362,7 @@  struct fuse_file_lock {
 #define FOPEN_STREAM		(1 << 4)
 #define FOPEN_NOFLUSH		(1 << 5)
 #define FOPEN_PARALLEL_DIRECT_WRITES	(1 << 6)
+#define FOPEN_FILE_CREATED	(1 << 7)
 
 /**
  * INIT request/reply flags
@@ -617,6 +619,7 @@  enum fuse_opcode {
 	FUSE_SYNCFS		= 50,
 	FUSE_TMPFILE		= 51,
 	FUSE_STATX		= 52,
+	FUSE_OPEN_ATOMIC	= 53,
 
 	/* CUSE specific operations */
 	CUSE_INIT		= 4096,