Message ID | 20231023183035.11035-3-bschubert@ddn.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fuse: full atomic open and atomic-open-revalidate | expand |
Thank you for addressing the symbolic link problems! Additionally, I found an incompatibility with the no_open feature. When the FUSE server has the no_open feature enabled, the atomic_open FUSE request returns a d_entry with an empty file handler and open option FOPEN_KEEP_CACHE (for files) or FOPEN_CACHE_DIR (for directories). With this situation, if we can set fc->no_open = 1 or fc->no_opendir = 1 after receiving the such FUSE reply, as follows, will make the atomic_open compatible with no_open feature: + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(NULL, ff, flags); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); + err = -ENOMEM; + goto out_err; + } + + if(ff->fh == 0) { + if(ff->open_flags & FOPEN_KEEP_CACHE) + fc->no_open = 1; + if(ff->open_flags & FOPEN_CACHE_DIR) + fc->no_opendir = 1; + } + + /* prevent racing/parallel lookup on a negative hashed */ On Tue, Oct 24, 2023 at 3:41 AM Bernd Schubert <bschubert@ddn.com> wrote: > > From: Dharmendra Singh <dsingh@ddn.com> > > This adds full atomic open support, to avoid lookup before open/create. > If the implementation (fuse server/daemon) does not support atomic open > it falls back to non-atomic open. > > Co-developed-by: Bernd Schubert <bschubert@ddn.com> > Signed-off-by: Bernd Schubert <bschubert@ddn.com> > Signed-off-by: Dharmendra Singh <dsingh@ddn.com> > Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com> > Cc: Miklos Szeredi <miklos@szeredi.hu> > Cc: Christian Brauner <brauner@kernel.org> > Cc: Al Viro <viro@zeniv.linux.org.uk> > Cc: Dharmendra Singh <dsingh@ddn.com> > Cc: linux-fsdevel@vger.kernel.org > --- > fs/fuse/dir.c | 214 +++++++++++++++++++++++++++++++++++++- > fs/fuse/fuse_i.h | 3 + > include/uapi/linux/fuse.h | 3 + > 3 files changed, 219 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > index e1095852601c..61cdb8e5f68e 100644 > --- a/fs/fuse/dir.c > +++ b/fs/fuse/dir.c > @@ -716,7 +716,7 @@ static int _fuse_create_open(struct inode *dir, struct dentry *entry, > > static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, > umode_t, dev_t); > -static int fuse_atomic_open(struct inode *dir, struct dentry *entry, > +static int fuse_create_open(struct inode *dir, struct dentry *entry, > struct file *file, unsigned flags, > umode_t mode) > { > @@ -763,6 +763,218 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, > return finish_no_open(file, res); > } > > +static int _fuse_atomic_open(struct inode *dir, struct dentry *entry, > + struct file *file, unsigned int flags, > + umode_t mode) > +{ > + int err; > + struct inode *inode; > + FUSE_ARGS(args); > + struct fuse_mount *fm = get_fuse_mount(dir); > + struct fuse_conn *fc = fm->fc; > + struct fuse_forget_link *forget; > + struct fuse_create_in inarg; > + struct fuse_open_out outopen; > + struct fuse_entry_out outentry; > + struct fuse_inode *fi; > + struct fuse_file *ff; > + struct dentry *switched_entry = NULL, *alias = NULL; > + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); > + > + /* Expect a negative dentry */ > + if (unlikely(d_inode(entry))) > + goto fallback; > + > + /* Userspace expects S_IFREG in create mode */ > + if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG) > + goto fallback; > + > + forget = fuse_alloc_forget(); > + err = -ENOMEM; > + if (!forget) > + goto out_err; > + > + err = -ENOMEM; > + ff = fuse_file_alloc(fm); > + if (!ff) > + goto out_put_forget_req; > + > + if (!fc->dont_mask) > + mode &= ~current_umask(); > + > + flags &= ~O_NOCTTY; > + memset(&inarg, 0, sizeof(inarg)); > + memset(&outentry, 0, sizeof(outentry)); > + inarg.flags = flags; > + inarg.mode = mode; > + inarg.umask = current_umask(); > + > + if (fc->handle_killpriv_v2 && (flags & O_TRUNC) && > + !(flags & O_EXCL) && !capable(CAP_FSETID)) { > + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; > + } > + > + args.opcode = FUSE_OPEN_ATOMIC; > + args.nodeid = get_node_id(dir); > + args.in_numargs = 2; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + args.in_args[1].size = entry->d_name.len + 1; > + args.in_args[1].value = entry->d_name.name; > + args.out_numargs = 2; > + args.out_args[0].size = sizeof(outentry); > + args.out_args[0].value = &outentry; > + args.out_args[1].size = sizeof(outopen); > + args.out_args[1].value = &outopen; > + > + if (flags & O_CREAT) { > + err = get_create_ext(&args, dir, entry, mode); > + if (err) > + goto out_free_ff; > + } > + > + err = fuse_simple_request(fm, &args); > + free_ext_value(&args); > + if (err == -ENOSYS || err == -ELOOP) { > + if (unlikely(err == -ENOSYS)) > + fc->no_open_atomic = 1; > + goto free_and_fallback; > + } > + > + if (!err && !outentry.nodeid) > + err = -ENOENT; > + > + if (err) > + goto out_free_ff; > + > + err = -EIO; > + if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) > + goto out_free_ff; > + > + ff->fh = outopen.fh; > + ff->nodeid = outentry.nodeid; > + ff->open_flags = outopen.open_flags; > + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, > + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); > + if (!inode) { > + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); > + fuse_sync_release(NULL, ff, flags); > + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); > + err = -ENOMEM; > + goto out_err; > + } > + > + /* prevent racing/parallel lookup on a negative hashed */ > + if (!(flags & O_CREAT) && !d_in_lookup(entry)) { > + d_drop(entry); > + switched_entry = d_alloc_parallel(entry->d_parent, > + &entry->d_name, &wq); > + if (IS_ERR(switched_entry)) { > + err = PTR_ERR(switched_entry); > + switched_entry = NULL; > + goto out_free_ff; > + } > + > + if (unlikely(!d_in_lookup(switched_entry))) { > + /* fall back */ > + dput(switched_entry); > + switched_entry = NULL; > + goto free_and_fallback; > + } > + > + entry = switched_entry; > + } > + > + if (d_really_is_negative(entry)) { > + d_drop(entry); > + alias = d_exact_alias(entry, inode); > + if (!alias) { > + alias = d_splice_alias(inode, entry); > + if (IS_ERR(alias)) { > + /* > + * Close the file in user space, but do not unlink it, > + * if it was created - with network file systems other > + * clients might have already accessed it. > + */ > + fi = get_fuse_inode(inode); > + fuse_sync_release(fi, ff, flags); > + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); > + err = PTR_ERR(alias); > + goto out_err; > + } > + } > + > + if (alias) > + entry = alias; > + } > + > + fuse_change_entry_timeout(entry, &outentry); > + > + /* File was indeed created */ > + if (outopen.open_flags & FOPEN_FILE_CREATED) { > + if (!(flags & O_CREAT)) { > + pr_debug("Server side bug, FOPEN_FILE_CREATED set " > + "without O_CREAT, ignoring."); > + } else { > + /* This should be always set when the file is created */ > + fuse_dir_changed(dir); > + file->f_mode |= FMODE_CREATED; > + } > + } > + > + if (S_ISDIR(mode)) > + ff->open_flags &= ~FOPEN_DIRECT_IO; > + err = finish_open(file, entry, generic_file_open); > + if (err) { > + fi = get_fuse_inode(inode); > + fuse_sync_release(fi, ff, flags); > + } else { > + file->private_data = ff; > + fuse_finish_open(inode, file); > + } > + > + kfree(forget); > + > + if (switched_entry) { > + d_lookup_done(switched_entry); > + dput(switched_entry); > + } > + > + dput(alias); > + > + return err; > + > +out_free_ff: > + fuse_file_free(ff); > +out_put_forget_req: > + kfree(forget); > +out_err: > + if (switched_entry) { > + d_lookup_done(switched_entry); > + dput(switched_entry); > + } > + > + return err; > + > +free_and_fallback: > + fuse_file_free(ff); > + kfree(forget); > +fallback: > + return fuse_create_open(dir, entry, file, flags, mode); > +} > + > +static int fuse_atomic_open(struct inode *dir, struct dentry *entry, > + struct file *file, unsigned int flags, > + umode_t mode) > +{ > + struct fuse_conn *fc = get_fuse_conn(dir); > + > + if (fc->no_open_atomic) > + return fuse_create_open(dir, entry, file, flags, mode); > + else > + return _fuse_atomic_open(dir, entry, file, flags, mode); > +} > + > /* > * Code shared between mknod, mkdir, symlink and link > */ > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index bf0b85d0b95c..af69578763ef 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -677,6 +677,9 @@ struct fuse_conn { > /** Is open/release not implemented by fs? */ > unsigned no_open:1; > > + /** Is open atomic not implemented by fs? */ > + unsigned no_open_atomic:1; > + > /** Is opendir/releasedir not implemented by fs? */ > unsigned no_opendir:1; > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index db92a7202b34..1508afbd9446 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -353,6 +353,7 @@ struct fuse_file_lock { > * FOPEN_STREAM: the file is stream-like (no file position at all) > * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) > * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode > + * FOPEN_FILE_CREATED: the file was indeed created > */ > #define FOPEN_DIRECT_IO (1 << 0) > #define FOPEN_KEEP_CACHE (1 << 1) > @@ -361,6 +362,7 @@ struct fuse_file_lock { > #define FOPEN_STREAM (1 << 4) > #define FOPEN_NOFLUSH (1 << 5) > #define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) > +#define FOPEN_FILE_CREATED (1 << 7) > > /** > * INIT request/reply flags > @@ -617,6 +619,7 @@ enum fuse_opcode { > FUSE_SYNCFS = 50, > FUSE_TMPFILE = 51, > FUSE_STATX = 52, > + FUSE_OPEN_ATOMIC = 53, > > /* CUSE specific operations */ > CUSE_INIT = 4096, > -- > 2.39.2 > >
On 10/24/23 12:12, Yuan Yao wrote: > Thank you for addressing the symbolic link problems! > Additionally, I found an incompatibility with the no_open feature. > When the FUSE server has the no_open feature enabled, the atomic_open > FUSE request returns a d_entry with an empty file handler and open > option FOPEN_KEEP_CACHE (for files) or FOPEN_CACHE_DIR (for > directories). With this situation, if we can set fc->no_open = 1 or > fc->no_opendir = 1 after receiving the such FUSE reply, as follows, > will make the atomic_open compatible with no_open feature: > + if (!inode) { > + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); > + fuse_sync_release(NULL, ff, flags); > + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); > + err = -ENOMEM; > + goto out_err; > + } > + > + if(ff->fh == 0) { > + if(ff->open_flags & FOPEN_KEEP_CACHE) > + fc->no_open = 1; > + if(ff->open_flags & FOPEN_CACHE_DIR) > + fc->no_opendir = 1; > + } > + > + /* prevent racing/parallel lookup on a negative hashed */ > Thanks again for your review! Hmm, are you sure atomic open needs to handle no-open? fuse_file_open sets no-open / no-opendir on -ENOSYS. _fuse_atomic_open has a handler for -ENOSYS and falls back to the existing create_open. So why does atomic open need a no-open handling? Thanks, Bernd
Currently, if _fuse_atomic_open receives an -ENOSYS error, the no_open_atomic flag is set, preventing further use of atomic_open. However, even with the no_open feature enabled, atomic_open can still provide performance benefits when creating new files due to its ability to combine FUSE lookup and FUSE create operations into a single atomic request. Therefore, I think it would be advantageous to allow these two features to coexist. Thanks, Yuan On Tue, Oct 24, 2023 at 9:36 PM Bernd Schubert <bernd.schubert@fastmail.fm> wrote: > > > > On 10/24/23 12:12, Yuan Yao wrote: > > Thank you for addressing the symbolic link problems! > > Additionally, I found an incompatibility with the no_open feature. > > When the FUSE server has the no_open feature enabled, the atomic_open > > FUSE request returns a d_entry with an empty file handler and open > > option FOPEN_KEEP_CACHE (for files) or FOPEN_CACHE_DIR (for > > directories). With this situation, if we can set fc->no_open = 1 or > > fc->no_opendir = 1 after receiving the such FUSE reply, as follows, > > will make the atomic_open compatible with no_open feature: > > + if (!inode) { > > + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); > > + fuse_sync_release(NULL, ff, flags); > > + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); > > + err = -ENOMEM; > > + goto out_err; > > + } > > + > > + if(ff->fh == 0) { > > + if(ff->open_flags & FOPEN_KEEP_CACHE) > > + fc->no_open = 1; > > + if(ff->open_flags & FOPEN_CACHE_DIR) > > + fc->no_opendir = 1; > > + } > > + > > + /* prevent racing/parallel lookup on a negative hashed */ > > > > Thanks again for your review! > > Hmm, are you sure atomic open needs to handle no-open? fuse_file_open > sets no-open / no-opendir on -ENOSYS. _fuse_atomic_open has a handler > for -ENOSYS and falls back to the existing create_open. So why does > atomic open need a no-open handling? > > > Thanks, > Bernd > > > >
On Mon, Oct 23, 2023 at 08:30:29PM +0200, Bernd Schubert wrote: > +{ > + int err; > + struct inode *inode; > + FUSE_ARGS(args); > + struct fuse_mount *fm = get_fuse_mount(dir); > + struct fuse_conn *fc = fm->fc; > + struct fuse_forget_link *forget; > + struct fuse_create_in inarg; > + struct fuse_open_out outopen; > + struct fuse_entry_out outentry; > + struct fuse_inode *fi; > + struct fuse_file *ff; > + struct dentry *switched_entry = NULL, *alias = NULL; > + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); > + > + /* Expect a negative dentry */ > + if (unlikely(d_inode(entry))) > + goto fallback; > + > + /* Userspace expects S_IFREG in create mode */ > + if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG) > + goto fallback; How could it get there with such mode? We could check that in fs/namei.c:atomic_open() (with if (WARN_ON_ONCE((open_flags & O_CREAT) && !S_ISREG(mode))) error = -EINVAL; // for the lack of EWTFAREYOUSMOKING else error = dir->i_op->atomic_open(....) or something similar), but that doesn't belong in the method instances. And it really should never happen - that thing comes from op->mode and we have build_open_flags() doing this: if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } so... Are other instances doing the same kind of paranoia? That BUG_ON() in current fuse_atomic_open() is bogus (and seriously misplaced)... > + forget = fuse_alloc_forget(); > + err = -ENOMEM; > + if (!forget) > + goto out_err; > + > + err = -ENOMEM; > + ff = fuse_file_alloc(fm); > + if (!ff) > + goto out_put_forget_req; > + > + if (!fc->dont_mask) > + mode &= ~current_umask(); > + > + flags &= ~O_NOCTTY; > + memset(&inarg, 0, sizeof(inarg)); > + memset(&outentry, 0, sizeof(outentry)); > + inarg.flags = flags; > + inarg.mode = mode; > + inarg.umask = current_umask(); > + > + if (fc->handle_killpriv_v2 && (flags & O_TRUNC) && > + !(flags & O_EXCL) && !capable(CAP_FSETID)) { > + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; > + } > + > + args.opcode = FUSE_OPEN_ATOMIC; > + args.nodeid = get_node_id(dir); > + args.in_numargs = 2; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + args.in_args[1].size = entry->d_name.len + 1; > + args.in_args[1].value = entry->d_name.name; > + args.out_numargs = 2; > + args.out_args[0].size = sizeof(outentry); > + args.out_args[0].value = &outentry; > + args.out_args[1].size = sizeof(outopen); > + args.out_args[1].value = &outopen; > + > + if (flags & O_CREAT) { > + err = get_create_ext(&args, dir, entry, mode); > + if (err) > + goto out_free_ff; > + } > + > + err = fuse_simple_request(fm, &args); > + free_ext_value(&args); > + if (err == -ENOSYS || err == -ELOOP) { > + if (unlikely(err == -ENOSYS)) > + fc->no_open_atomic = 1; > + goto free_and_fallback; > + } > + > + if (!err && !outentry.nodeid) > + err = -ENOENT; > + > + if (err) > + goto out_free_ff; > + > + err = -EIO; > + if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) > + goto out_free_ff; > + > + ff->fh = outopen.fh; > + ff->nodeid = outentry.nodeid; > + ff->open_flags = outopen.open_flags; > + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, > + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); > + if (!inode) { > + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); > + fuse_sync_release(NULL, ff, flags); > + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); > + err = -ENOMEM; > + goto out_err; > + } > + > + /* prevent racing/parallel lookup on a negative hashed */ > + if (!(flags & O_CREAT) && !d_in_lookup(entry)) { ... in which case it has just passed ->d_revalidate()... > + d_drop(entry); > + switched_entry = d_alloc_parallel(entry->d_parent, > + &entry->d_name, &wq); > + if (IS_ERR(switched_entry)) { > + err = PTR_ERR(switched_entry); > + switched_entry = NULL; > + goto out_free_ff; leaked inode? > + } > + > + if (unlikely(!d_in_lookup(switched_entry))) { > + /* fall back */ > + dput(switched_entry); > + switched_entry = NULL; > + goto free_and_fallback; ditto, and I don't really understand what the hell is going on with dentry references here. What is the intended behaviour in that case? > + } > + > + entry = switched_entry; > + } > + > + if (d_really_is_negative(entry)) { > + d_drop(entry); > + alias = d_exact_alias(entry, inode); What case is that about? "We have an unhashed positive dentry with that exact name, parent and inode"? Where would it have come from? Another thing: this does not consume an inode reference, no matter what gets returned, > + if (!alias) { > + alias = d_splice_alias(inode, entry); but that one *does* consume the inode reference; note the igrab() in nfs4 code where you've nicked that from... > + if (IS_ERR(alias)) { > + /* > + * Close the file in user space, but do not unlink it, > + * if it was created - with network file systems other > + * clients might have already accessed it. > + */ > + fi = get_fuse_inode(inode); ... so this is asking for UAF. > + fuse_sync_release(fi, ff, flags); > + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); > + err = PTR_ERR(alias); > + goto out_err; > + } > + } > + > + if (alias) > + entry = alias; > + } ... and here we have no way to tell if inode needs to be dropped. > + > + fuse_change_entry_timeout(entry, &outentry); > + > + /* File was indeed created */ > + if (outopen.open_flags & FOPEN_FILE_CREATED) { > + if (!(flags & O_CREAT)) { > + pr_debug("Server side bug, FOPEN_FILE_CREATED set " > + "without O_CREAT, ignoring."); > + } else { > + /* This should be always set when the file is created */ > + fuse_dir_changed(dir); > + file->f_mode |= FMODE_CREATED; > + } > + } > + > + if (S_ISDIR(mode)) > + ff->open_flags &= ~FOPEN_DIRECT_IO; > + err = finish_open(file, entry, generic_file_open); > + if (err) { > + fi = get_fuse_inode(inode); > + fuse_sync_release(fi, ff, flags); > + } else { > + file->private_data = ff; > + fuse_finish_open(inode, file); > + } > + > + kfree(forget); > + > + if (switched_entry) { > + d_lookup_done(switched_entry); > + dput(switched_entry); > + } > + > + dput(alias); > + > + return err; > + > +out_free_ff: > + fuse_file_free(ff); > +out_put_forget_req: > + kfree(forget); > +out_err: > + if (switched_entry) { > + d_lookup_done(switched_entry); > + dput(switched_entry); > + } > + > + return err; > + > +free_and_fallback: > + fuse_file_free(ff); > + kfree(forget); > +fallback: > + return fuse_create_open(dir, entry, file, flags, mode); > +}
Thanks a lot for all your reviews, Al! On 10/28/23 05:03, Al Viro wrote: > On Mon, Oct 23, 2023 at 08:30:29PM +0200, Bernd Schubert wrote: >> +{ >> + int err; >> + struct inode *inode; >> + FUSE_ARGS(args); >> + struct fuse_mount *fm = get_fuse_mount(dir); >> + struct fuse_conn *fc = fm->fc; >> + struct fuse_forget_link *forget; >> + struct fuse_create_in inarg; >> + struct fuse_open_out outopen; >> + struct fuse_entry_out outentry; >> + struct fuse_inode *fi; >> + struct fuse_file *ff; >> + struct dentry *switched_entry = NULL, *alias = NULL; >> + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); >> + >> + /* Expect a negative dentry */ >> + if (unlikely(d_inode(entry))) >> + goto fallback; >> + >> + /* Userspace expects S_IFREG in create mode */ >> + if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG) >> + goto fallback; > > How could it get there with such mode? We could check that > in fs/namei.c:atomic_open() (with > if (WARN_ON_ONCE((open_flags & O_CREAT) && !S_ISREG(mode))) > error = -EINVAL; // for the lack of EWTFAREYOUSMOKING > else > error = dir->i_op->atomic_open(....) > or something similar), but that doesn't belong in the method instances. > And it really should never happen - that thing comes from op->mode and > we have build_open_flags() doing this: > if (WILL_CREATE(flags)) { > if (how->mode & ~S_IALLUGO) > return -EINVAL; > op->mode = how->mode | S_IFREG; > } else { > if (how->mode != 0) > return -EINVAL; > op->mode = 0; > } > so... Are other instances doing the same kind of paranoia? That BUG_ON() > in current fuse_atomic_open() is bogus (and seriously misplaced)... Ok, sorry, we took over the check blindly. I added another patch in the v11 branch to remove the BUG_ON in current fuse_atomic_open. > >> + forget = fuse_alloc_forget(); >> + err = -ENOMEM; >> + if (!forget) >> + goto out_err; >> + >> + err = -ENOMEM; >> + ff = fuse_file_alloc(fm); >> + if (!ff) >> + goto out_put_forget_req; >> + >> + if (!fc->dont_mask) >> + mode &= ~current_umask(); >> + >> + flags &= ~O_NOCTTY; >> + memset(&inarg, 0, sizeof(inarg)); >> + memset(&outentry, 0, sizeof(outentry)); >> + inarg.flags = flags; >> + inarg.mode = mode; >> + inarg.umask = current_umask(); >> + >> + if (fc->handle_killpriv_v2 && (flags & O_TRUNC) && >> + !(flags & O_EXCL) && !capable(CAP_FSETID)) { >> + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; >> + } >> + >> + args.opcode = FUSE_OPEN_ATOMIC; >> + args.nodeid = get_node_id(dir); >> + args.in_numargs = 2; >> + args.in_args[0].size = sizeof(inarg); >> + args.in_args[0].value = &inarg; >> + args.in_args[1].size = entry->d_name.len + 1; >> + args.in_args[1].value = entry->d_name.name; >> + args.out_numargs = 2; >> + args.out_args[0].size = sizeof(outentry); >> + args.out_args[0].value = &outentry; >> + args.out_args[1].size = sizeof(outopen); >> + args.out_args[1].value = &outopen; >> + >> + if (flags & O_CREAT) { >> + err = get_create_ext(&args, dir, entry, mode); >> + if (err) >> + goto out_free_ff; >> + } >> + >> + err = fuse_simple_request(fm, &args); >> + free_ext_value(&args); >> + if (err == -ENOSYS || err == -ELOOP) { >> + if (unlikely(err == -ENOSYS)) >> + fc->no_open_atomic = 1; >> + goto free_and_fallback; >> + } >> + >> + if (!err && !outentry.nodeid) >> + err = -ENOENT; >> + >> + if (err) >> + goto out_free_ff; >> + >> + err = -EIO; >> + if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) >> + goto out_free_ff; >> + >> + ff->fh = outopen.fh; >> + ff->nodeid = outentry.nodeid; >> + ff->open_flags = outopen.open_flags; >> + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, >> + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); >> + if (!inode) { >> + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); >> + fuse_sync_release(NULL, ff, flags); >> + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); >> + err = -ENOMEM; >> + goto out_err; >> + } >> + >> + /* prevent racing/parallel lookup on a negative hashed */ >> + if (!(flags & O_CREAT) && !d_in_lookup(entry)) { > > ... in which case it has just passed ->d_revalidate()... With the follow up patches that revalidate is going to be moved to this function. Is there anything here that needs to be improved? I had added that check after looking through the other atomic_open methods and then noticed your commit c94c09535c4d: nfs_atomic_open(): prevent parallel nfs_lookup() on a negative hashed > >> + d_drop(entry); >> + switched_entry = d_alloc_parallel(entry->d_parent, >> + &entry->d_name, &wq); >> + if (IS_ERR(switched_entry)) { >> + err = PTR_ERR(switched_entry); >> + switched_entry = NULL; >> + goto out_free_ff; > > leaked inode? Yikes, silly me and with that also leaked fuse userspace inode. > >> + } >> + >> + if (unlikely(!d_in_lookup(switched_entry))) { >> + /* fall back */ >> + dput(switched_entry); >> + switched_entry = NULL; >> + goto free_and_fallback; > > ditto, and I don't really understand what the hell is going on with > dentry references here. What is the intended behaviour in that case? The idea was to give up 'switched_entry' and let the existing fuse_create_open do the fallback work. Hmm, yeah, already called d_drop(dentry). And it also already did the userspace part - fallback without fuse-forget is totally wrong. I guess I need severe patch update because of this - the other parts are not difficult, but here it gets complex. > >> + } >> + >> + entry = switched_entry; >> + } >> + >> + if (d_really_is_negative(entry)) { >> + d_drop(entry); >> + alias = d_exact_alias(entry, inode); > > What case is that about? "We have an unhashed positive dentry with that > exact name, parent and inode"? Where would it have come from? Sorry, taken from _nfs4_open_and_get_state and I wasn't sure if it is needed. A bit lame excuse, but NFS is the only other file system I found that handles !O_CREAT. I definitely should have marked it with something like /* XXX is this really needed, from _nfs4_open_and_get_state */ > > Another thing: this does not consume an inode reference, no matter what > gets returned, > >> + if (!alias) { >> + alias = d_splice_alias(inode, entry); > > but that one *does* consume the inode reference; note the igrab() in > nfs4 code where you've nicked that from... > >> + if (IS_ERR(alias)) { >> + /* >> + * Close the file in user space, but do not unlink it, >> + * if it was created - with network file systems other >> + * clients might have already accessed it. >> + */ >> + fi = get_fuse_inode(inode); > > ... so this is asking for UAF. Yeah, and staring at it again, the below fuse_queue_forget is not right either, as that is already handled through the inode reference / ->evict_inode. > >> + fuse_sync_release(fi, ff, flags); >> + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); >> + err = PTR_ERR(alias); >> + goto out_err; >> + } >> + } >> + >> + if (alias) >> + entry = alias; >> + } > > ... and here we have no way to tell if inode needs to be dropped. I guess I could solve this with another variable, but maybe there is a more beautiful way. I first need to think about the !d_in_lookup(switched_entry). Thanks again so much for your reviews, Bernd
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e1095852601c..61cdb8e5f68e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -716,7 +716,7 @@ static int _fuse_create_open(struct inode *dir, struct dentry *entry, static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -static int fuse_atomic_open(struct inode *dir, struct dentry *entry, +static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) { @@ -763,6 +763,218 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, return finish_no_open(file, res); } +static int _fuse_atomic_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, + umode_t mode) +{ + int err; + struct inode *inode; + FUSE_ARGS(args); + struct fuse_mount *fm = get_fuse_mount(dir); + struct fuse_conn *fc = fm->fc; + struct fuse_forget_link *forget; + struct fuse_create_in inarg; + struct fuse_open_out outopen; + struct fuse_entry_out outentry; + struct fuse_inode *fi; + struct fuse_file *ff; + struct dentry *switched_entry = NULL, *alias = NULL; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + /* Expect a negative dentry */ + if (unlikely(d_inode(entry))) + goto fallback; + + /* Userspace expects S_IFREG in create mode */ + if ((flags & O_CREAT) && (mode & S_IFMT) != S_IFREG) + goto fallback; + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) + goto out_err; + + err = -ENOMEM; + ff = fuse_file_alloc(fm); + if (!ff) + goto out_put_forget_req; + + if (!fc->dont_mask) + mode &= ~current_umask(); + + flags &= ~O_NOCTTY; + memset(&inarg, 0, sizeof(inarg)); + memset(&outentry, 0, sizeof(outentry)); + inarg.flags = flags; + inarg.mode = mode; + inarg.umask = current_umask(); + + if (fc->handle_killpriv_v2 && (flags & O_TRUNC) && + !(flags & O_EXCL) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + + args.opcode = FUSE_OPEN_ATOMIC; + args.nodeid = get_node_id(dir); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outentry); + args.out_args[0].value = &outentry; + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; + + if (flags & O_CREAT) { + err = get_create_ext(&args, dir, entry, mode); + if (err) + goto out_free_ff; + } + + err = fuse_simple_request(fm, &args); + free_ext_value(&args); + if (err == -ENOSYS || err == -ELOOP) { + if (unlikely(err == -ENOSYS)) + fc->no_open_atomic = 1; + goto free_and_fallback; + } + + if (!err && !outentry.nodeid) + err = -ENOENT; + + if (err) + goto out_free_ff; + + err = -EIO; + if (invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) + goto out_free_ff; + + ff->fh = outopen.fh; + ff->nodeid = outentry.nodeid; + ff->open_flags = outopen.open_flags; + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(NULL, ff, flags); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); + err = -ENOMEM; + goto out_err; + } + + /* prevent racing/parallel lookup on a negative hashed */ + if (!(flags & O_CREAT) && !d_in_lookup(entry)) { + d_drop(entry); + switched_entry = d_alloc_parallel(entry->d_parent, + &entry->d_name, &wq); + if (IS_ERR(switched_entry)) { + err = PTR_ERR(switched_entry); + switched_entry = NULL; + goto out_free_ff; + } + + if (unlikely(!d_in_lookup(switched_entry))) { + /* fall back */ + dput(switched_entry); + switched_entry = NULL; + goto free_and_fallback; + } + + entry = switched_entry; + } + + if (d_really_is_negative(entry)) { + d_drop(entry); + alias = d_exact_alias(entry, inode); + if (!alias) { + alias = d_splice_alias(inode, entry); + if (IS_ERR(alias)) { + /* + * Close the file in user space, but do not unlink it, + * if it was created - with network file systems other + * clients might have already accessed it. + */ + fi = get_fuse_inode(inode); + fuse_sync_release(fi, ff, flags); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); + err = PTR_ERR(alias); + goto out_err; + } + } + + if (alias) + entry = alias; + } + + fuse_change_entry_timeout(entry, &outentry); + + /* File was indeed created */ + if (outopen.open_flags & FOPEN_FILE_CREATED) { + if (!(flags & O_CREAT)) { + pr_debug("Server side bug, FOPEN_FILE_CREATED set " + "without O_CREAT, ignoring."); + } else { + /* This should be always set when the file is created */ + fuse_dir_changed(dir); + file->f_mode |= FMODE_CREATED; + } + } + + if (S_ISDIR(mode)) + ff->open_flags &= ~FOPEN_DIRECT_IO; + err = finish_open(file, entry, generic_file_open); + if (err) { + fi = get_fuse_inode(inode); + fuse_sync_release(fi, ff, flags); + } else { + file->private_data = ff; + fuse_finish_open(inode, file); + } + + kfree(forget); + + if (switched_entry) { + d_lookup_done(switched_entry); + dput(switched_entry); + } + + dput(alias); + + return err; + +out_free_ff: + fuse_file_free(ff); +out_put_forget_req: + kfree(forget); +out_err: + if (switched_entry) { + d_lookup_done(switched_entry); + dput(switched_entry); + } + + return err; + +free_and_fallback: + fuse_file_free(ff); + kfree(forget); +fallback: + return fuse_create_open(dir, entry, file, flags, mode); +} + +static int fuse_atomic_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, + umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + + if (fc->no_open_atomic) + return fuse_create_open(dir, entry, file, flags, mode); + else + return _fuse_atomic_open(dir, entry, file, flags, mode); +} + /* * Code shared between mknod, mkdir, symlink and link */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bf0b85d0b95c..af69578763ef 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -677,6 +677,9 @@ struct fuse_conn { /** Is open/release not implemented by fs? */ unsigned no_open:1; + /** Is open atomic not implemented by fs? */ + unsigned no_open_atomic:1; + /** Is opendir/releasedir not implemented by fs? */ unsigned no_opendir:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index db92a7202b34..1508afbd9446 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -353,6 +353,7 @@ struct fuse_file_lock { * FOPEN_STREAM: the file is stream-like (no file position at all) * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode + * FOPEN_FILE_CREATED: the file was indeed created */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) @@ -361,6 +362,7 @@ struct fuse_file_lock { #define FOPEN_STREAM (1 << 4) #define FOPEN_NOFLUSH (1 << 5) #define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) +#define FOPEN_FILE_CREATED (1 << 7) /** * INIT request/reply flags @@ -617,6 +619,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_OPEN_ATOMIC = 53, /* CUSE specific operations */ CUSE_INIT = 4096,