Message ID | YsN0GURKuaAqXB/e@ZenIV (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [1/7] __follow_mount_rcu(): verify that mount_lock remains unchanged | expand |
Just in case - this series is RFC only; it's very lightly tested. Might be carved into too many steps, at that. Cumulative delta follows; might or might not be more convenient for review. diff --git a/fs/namei.c b/fs/namei.c index 1f28d3f463c3..f2c99e75b578 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -567,7 +567,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; - unsigned seq, m_seq, r_seq; + unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -772,6 +772,7 @@ static bool try_to_unlazy(struct nameidata *nd) goto out; if (unlikely(!legitimize_root(nd))) goto out; + nd->seq = nd->next_seq = 0; rcu_read_unlock(); BUG_ON(nd->inode != parent->d_inode); return true; @@ -780,6 +781,7 @@ static bool try_to_unlazy(struct nameidata *nd) nd->path.mnt = NULL; nd->path.dentry = NULL; out: + nd->seq = nd->next_seq = 0; rcu_read_unlock(); return false; } @@ -788,7 +790,6 @@ static bool try_to_unlazy(struct nameidata *nd) * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into - * @seq: seq number to check @dentry against * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already @@ -797,7 +798,7 @@ static bool try_to_unlazy(struct nameidata *nd) * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ -static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { BUG_ON(!(nd->flags & LOOKUP_RCU)); @@ -818,7 +819,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + if (unlikely(read_seqcount_retry(&dentry->d_seq, nd->next_seq))) goto out_dput; /* * Sequence counts matched. Now make sure that the root is @@ -826,6 +827,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!legitimize_root(nd))) goto out_dput; + nd->seq = nd->next_seq = 0; rcu_read_unlock(); return true; @@ -834,9 +836,11 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi out1: nd->path.dentry = NULL; out: + nd->seq = nd->next_seq = 0; rcu_read_unlock(); return false; out_dput: + nd->seq = nd->next_seq = 0; rcu_read_unlock(); dput(dentry); return false; @@ -1466,8 +1470,7 @@ EXPORT_SYMBOL(follow_down); * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ -static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode, unsigned *seqp) +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; @@ -1496,15 +1499,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; - *seqp = read_seqcount_begin(&dentry->d_seq); - *inode = dentry->d_inode; - /* - * We don't need to re-check ->d_seq after this - * ->d_inode read - there will be an RCU delay - * between mount hash removal and ->mnt_root - * becoming unpinned. - */ + nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; + // makes sure that non-RCU pathwalk could reach + // this state. + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) @@ -1515,8 +1515,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct inode **inode, - unsigned int *seqp) + struct path *path) { bool jumped; int ret; @@ -1524,16 +1523,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { - unsigned int seq = *seqp; - if (unlikely(!*inode)) - return -ENOENT; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + unsigned int seq = nd->next_seq; + if (likely(__follow_mount_rcu(nd, path))) return 0; - if (!try_to_unlazy_next(nd, dentry, seq)) - return -ECHILD; - // *path might've been clobbered by __follow_mount_rcu() + // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; + nd->next_seq = seq; + if (!try_to_unlazy_next(nd, dentry)) + return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { @@ -1546,9 +1544,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); - } else { - *inode = d_backing_inode(path->dentry); - *seqp = 0; /* out of RCU mode, so the value doesn't matter */ } return ret; } @@ -1607,9 +1602,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static struct dentry *lookup_fast(struct nameidata *nd, - struct inode **inode, - unsigned *seqp) +static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1620,37 +1613,24 @@ static struct dentry *lookup_fast(struct nameidata *nd, * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { - unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq); + dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } - /* - * This sequence count validates that the inode matches - * the dentry name information from lookup. - */ - *inode = d_backing_inode(dentry); - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return ERR_PTR(-ECHILD); - - /* + /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. - * - * The memory barrier in read_seqcount_begin of child is - * enough, we can use __read_seqcount_retry here. */ - if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) + if (unlikely(read_seqcount_retry(&parent->d_seq, nd->seq))) return ERR_PTR(-ECHILD); - *seqp = seq; status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; - if (!try_to_unlazy_next(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ @@ -1731,7 +1711,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns, return inode_permission(mnt_userns, nd->inode, MAY_EXEC); } -static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) +static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; @@ -1746,7 +1726,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it - bool grabbed_link = legitimize_path(nd, link, seq); + bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; @@ -1760,11 +1740,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq, int flags) + struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link, seq); + int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) @@ -1774,7 +1754,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - last->seq = seq; + last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); @@ -1836,43 +1816,50 @@ static const char *pick_link(struct nameidata *nd, struct path *link, * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. + * + * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, - struct dentry *dentry, struct inode *inode, unsigned seq) + struct dentry *dentry) { struct path path; - int err = handle_mounts(nd, dentry, &path, &inode, &seq); + struct inode *inode; + int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); + inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - if (!(nd->flags & LOOKUP_RCU)) { + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; - nd->seq = seq; + nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, seq)) + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } - return pick_link(nd, &path, inode, seq, flags); + return pick_link(nd, &path, inode, flags); } -static struct dentry *follow_dotdot_rcu(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; @@ -1889,14 +1876,15 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; + // makes sure that non-RCU pathwalk could reach this state if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; - *inodep = parent->d_inode; - *seqp = read_seqcount_begin(&parent->d_seq); + nd->next_seq = read_seqcount_begin(&parent->d_seq); + // makes sure that non-RCU pathwalk could reach this state if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) @@ -1907,12 +1895,11 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); - return NULL; + nd->next_seq = nd->seq; + return nd->path.dentry; } -static struct dentry *follow_dotdot(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; @@ -1936,15 +1923,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd, dput(parent); return ERR_PTR(-ENOENT); } - *seqp = 0; - *inodep = parent->d_inode; return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - dget(nd->path.dentry); - return NULL; + return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) @@ -1952,8 +1936,6 @@ static const char *handle_dots(struct nameidata *nd, int type) if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; - struct inode *inode; - unsigned seq; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); @@ -1961,17 +1943,12 @@ static const char *handle_dots(struct nameidata *nd, int type) return error; } if (nd->flags & LOOKUP_RCU) - parent = follow_dotdot_rcu(nd, &inode, &seq); + parent = follow_dotdot_rcu(nd); else - parent = follow_dotdot(nd, &inode, &seq); + parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); - if (unlikely(!parent)) - error = step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); - else - error = step_into(nd, WALK_NOFOLLOW, - parent, inode, seq); + error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; @@ -1995,8 +1972,6 @@ static const char *handle_dots(struct nameidata *nd, int type) static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; - struct inode *inode; - unsigned seq; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and @@ -2007,7 +1982,7 @@ static const char *walk_component(struct nameidata *nd, int flags) put_link(nd); return handle_dots(nd, nd->last_type); } - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { @@ -2017,7 +1992,7 @@ static const char *walk_component(struct nameidata *nd, int flags) } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return step_into(nd, flags, dentry, inode, seq); + return step_into(nd, flags, dentry); } /* @@ -2372,6 +2347,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); + else + nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; @@ -2473,8 +2450,8 @@ static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - return PTR_ERR(step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq)); + nd->next_seq = nd->seq; + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -3393,8 +3370,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; - unsigned seq; - struct inode *inode; struct dentry *dentry; const char *res; @@ -3410,7 +3385,7 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) @@ -3464,7 +3439,7 @@ static const char *open_last_lookups(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res;
On Mon, Jul 4, 2022 at 4:19 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > - unsigned seq, m_seq, r_seq; > + unsigned seq, next_seq, m_seq, r_seq; So the main thing I react to here is how "next_seq" is in the "struct nameidata", but then it always goes together with a "struct dentry" that you end up having to pass separately (and that is *not* in that "struct nameidata"). Now, saving the associated dentry (as "next_dentry") in the nd would solve that, but ends up benign ugly since everything then wants to look at the dentry anyway, so while it would solve the inconsistency, it would be ugly. I wonder if the solution might not be to create a new structure like struct rcu_dentry { struct dentry *dentry; unsigned seq; }; and in fact then we could make __d_lookup_rcu() return one of these things (we already rely on that "returning a two-word structure is efficient" elsewhere). That would then make that "this dentry goes with this sequence number" be a very clear thing, and I actually thjink that it would make __d_lookup_rcu() have a cleaner calling convention too, ie we'd go from dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); rto dseq = __d_lookup_rcu(parent, &nd->last); and it would even improve code generation because it now returns the dentry and the sequence number in registers, instead of returning one in a register and one in memory. I did *not* look at how it would change some of the other places, but I do like the notion of "keep the dentry and the sequence number that goes with it together". That "keep dentry as a local, keep the sequence number that goes with it as a field in the 'nd'" really does seem an odd thing. So I'm throwing the above out as a "maybe we could do this instead..". Not a huge deal. That oddity or not, I think the patch series is an improvement. I do have a minor gripe with this too: > + nd->seq = nd->next_seq = 0; I'm not convinced "0" is a good value. It's not supposed to match anything, but it *could* match a valid sequence number. Wouldn't it be better to pick something that is explicitly invalid and has the low bit set (ie 1 or -1). We don't seem to have a SEQ_INVAL or anything like that, but it does seem that if the intent is to make it clear it's not a real sequence number any more at that point, then 0 isn't great. But again, this is more of a stylistic detail thing than a real complaint. Linus
On Mon, Jul 04, 2022 at 05:06:17PM -0700, Linus Torvalds wrote: > I wonder if the solution might not be to create a new structure like > > struct rcu_dentry { > struct dentry *dentry; > unsigned seq; > }; > > and in fact then we could make __d_lookup_rcu() return one of these > things (we already rely on that "returning a two-word structure is > efficient" elsewhere). > > That would then make that "this dentry goes with this sequence number" > be a very clear thing, and I actually thjink that it would make > __d_lookup_rcu() have a cleaner calling convention too, ie we'd go > from > > dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); > > rto > > dseq = __d_lookup_rcu(parent, &nd->last); > > and it would even improve code generation because it now returns the > dentry and the sequence number in registers, instead of returning one > in a register and one in memory. > > I did *not* look at how it would change some of the other places, but > I do like the notion of "keep the dentry and the sequence number that > goes with it together". > > That "keep dentry as a local, keep the sequence number that goes with > it as a field in the 'nd'" really does seem an odd thing. So I'm > throwing the above out as a "maybe we could do this instead..". I looked into that; turns out to be quite messy, unfortunately. For one thing, the distance between the places where we get the seq count and the place where we consume it is large; worse, there's a bunch of paths where we are in non-RCU mode converging to the same consumer and those need a 0/1/-1/whatever paired with dentry. Gets very clumsy... There might be a clever way to deal with pairs cleanly, but I don't see it at the moment. I'll look into that some more, but... BTW, how good gcc and clang are at figuring out that e.g. static int foo(int n) { if (likely(n >= 0)) return 0; .... } .... if (foo(n)) whatever(); should be treated as if (unlikely(foo(n))) whatever(); They certainly do it just fine if the damn thing is inlined (e.g. all those unlikely(read_seqcount_retry(....)) can and should lose unlikely), but do they manage that for non-inlined functions in the same compilation unit? Relatively recent gcc seems to...
diff --git a/fs/namei.c b/fs/namei.c index 1f28d3f463c3..4dbf55b37ec6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1505,6 +1505,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * becoming unpinned. */ flags = dentry->d_flags; + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq))
Validate mount_lock seqcount as soon as we cross into mount in RCU mode. Sure, ->mnt_root is pinned and will remain so until we do rcu_read_unlock() anyway, and we will eventually fail to unlazy if the mount_lock had been touched, but we might run into a hard error (e.g. -ENOENT) before trying to unlazy. And it's possible to end up with RCU pathwalk racing with rename() and umount() in a way that would fail with -ENOENT while non-RCU pathwalk would've succeeded with any timings. Once upon a time we hadn't needed that, but analysis had been subtle, brittle and went out of window as soon as RENAME_EXCHANGE had been added. It's narrow, hard to hit and won't get you anything other than stray -ENOENT that could be arranged in much easier way with the same priveleges, but it's a bug all the same. Cc: stable@kernel.org X-sky-is-falling: unlikely Fixes: da1ce0670c14 "vfs: add cross-rename" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+)