Message ID | 20231030003759.GW800259@ZenIV (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] simplifying fast_dput(), dentry_kill() et.al. | expand |
On Mon, Oct 30, 2023 at 12:37:59AM +0000, Al Viro wrote: > Back in 2015 when fast_dput() got introduced, I'd been worried > about ->d_delete() being exposed to dentries with zero refcount. > To quote my reply to Linus back then, > > "The only potential nastiness I can see here is that filesystem with > ->d_delete() always returning 1 might be surprised by encountering > a hashed dentry with zero d_count. I can't recall anything actually > sensitive to that, and there might very well be no such examples, > but in principle it might be a problem. Might be a good idea to check > DCACHE_OP_DELETE before anything else..." > > Looking at that again, that check was not a good idea. Sure, ->d_delete() > instances could, in theory, check d_count (as BUG_ON(d_count(dentry) != 1) > or something equally useful) or, worse, drop and regain ->d_lock. > The latter would be rather hard to pull off safely, but it is not > impossible. The thing is, none of the in-tree instances do anything of > that sort and I don't see any valid reasons why anyone would want to. > > And getting rid of that would, AFAICS, allow for much simpler rules > around __dentry_kill() and friends - we could hold rcu_read_lock > over the places where dentry_kill() drops/regains ->d_lock and > that would allow > * fast_dput() always decrementing refcount > * retain_dentry() never modifying it > * __dentry_kill() always called with refcount 0 (currently > it gets 1 from dentry_kill() and 0 in all other cases) > > Does anybody see any problems with something along the lines of the > (untested) patch below? It would need to be carved up (and accompanied > by "thou shalt not play silly buggers with ->d_lockref in your > ->d_delete() instances" in D/f/porting), obviously, but I would really > like to get saner rules around refcount manipulations in there - as > it is, trying to document them gets very annoying. > > Comments? After fixing a couple of brainos, it seems to work. See below: diff --git a/fs/dcache.c b/fs/dcache.c index 9f471fdb768b..5e975a013508 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry) return false; /* retain; LRU fodder */ - dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) @@ -709,7 +708,7 @@ EXPORT_SYMBOL(d_mark_dontcache); * Returns dentry requiring refcount drop, or NULL if we're done. */ static struct dentry *dentry_kill(struct dentry *dentry) - __releases(dentry->d_lock) + __releases(dentry->d_lock) __releases(rcu) { struct inode *inode = dentry->d_inode; struct dentry *parent = NULL; @@ -730,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry) goto slow_positive; } } + rcu_read_unlock(); __dentry_kill(dentry); return parent; @@ -739,9 +739,8 @@ static struct dentry *dentry_kill(struct dentry *dentry) spin_lock(&dentry->d_lock); parent = lock_parent(dentry); got_locks: - if (unlikely(dentry->d_lockref.count != 1)) { - dentry->d_lockref.count--; - } else if (likely(!retain_dentry(dentry))) { + rcu_read_unlock(); + if (likely(dentry->d_lockref.count == 0 && !retain_dentry(dentry))) { __dentry_kill(dentry); return parent; } @@ -768,15 +767,7 @@ static inline bool fast_dput(struct dentry *dentry) unsigned int d_flags; /* - * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put_or_lock". - */ - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) - return lockref_put_or_lock(&dentry->d_lockref); - - /* - * .. otherwise, we can try to just decrement the - * lockref optimistically. + * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); @@ -787,8 +778,12 @@ static inline bool fast_dput(struct dentry *dentry) */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); - if (dentry->d_lockref.count > 1) { - dentry->d_lockref.count--; + if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { + spin_unlock(&dentry->d_lock); + return true; + } + dentry->d_lockref.count--; + if (dentry->d_lockref.count) { spin_unlock(&dentry->d_lock); return true; } @@ -830,7 +825,7 @@ static inline bool fast_dput(struct dentry *dentry) */ smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE | DCACHE_DISCONNECTED | DCACHE_DONTCACHE; /* Nothing to do? Dropping the reference was all we needed? */ @@ -854,13 +849,6 @@ static inline bool fast_dput(struct dentry *dentry) spin_unlock(&dentry->d_lock); return true; } - - /* - * Re-get the reference we optimistically dropped. We hold the - * lock, and we just tested that it was zero, so we can just - * set it to 1. - */ - dentry->d_lockref.count = 1; return false; } @@ -903,10 +891,9 @@ void dput(struct dentry *dentry) } /* Slow case: now with the dentry lock held */ - rcu_read_unlock(); - if (likely(retain_dentry(dentry))) { spin_unlock(&dentry->d_lock); + rcu_read_unlock(); return; } @@ -918,14 +905,10 @@ EXPORT_SYMBOL(dput); static void __dput_to_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - /* let the owner of the list it's on deal with it */ - --dentry->d_lockref.count; - } else { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); - if (!--dentry->d_lockref.count) - d_shrink_add(dentry, list); + d_shrink_add(dentry, list); } } @@ -1191,7 +1174,7 @@ void shrink_dentry_list(struct list_head *list) rcu_read_unlock(); d_shrink_del(dentry); parent = dentry->d_parent; - if (parent != dentry) + if (parent != dentry && !--parent->d_lockref.count) __dput_to_list(parent, list); __dentry_kill(dentry); } @@ -1638,7 +1621,8 @@ void shrink_dcache_parent(struct dentry *parent) } else { rcu_read_unlock(); parent = data.victim->d_parent; - if (parent != data.victim) + if (parent != data.victim && + !--parent->d_lockref.count) __dput_to_list(parent, &data.dispose); __dentry_kill(data.victim); }
On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: > > After fixing a couple of brainos, it seems to work. This all makes me unnaturally nervous, probably because it;s overly subtle, and I have lost the context for some of the rules. I like the patch, because honestly, our current logic for dput_fast() is nasty, andI agree with you that the existence of d_op->d_delete() shouldn't change the locking logic. At the same time, I just worry. That whole lockref_put_return() thing has horrific semantics, and this is the only case that uses it, and I wish we didn't need it. [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/ too, and that looks completely bogus, since it doesn't check the return value! ] At the same time, that whole fast_dpu() is one of the more critical places, and we definitely don't want to take the lock just because the ref goes down to zero (and we still leave it around). End result: I *think* that patch is an improvement, but this code just makes me unreasonably nervous. Linus
On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote: > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > After fixing a couple of brainos, it seems to work. > > This all makes me unnaturally nervous, probably because it;s overly > subtle, and I have lost the context for some of the rules. A bit of context: I started to look at the possibility of refcount overflows. Writing the current rules for dentry refcounting and lifetime down was the obvious first step, and that immediately turned into an awful mess. It is overly subtle. Even more so when you throw the shrink lists into the mix - shrink_lock_dentry() got too smart for its own good, and that leads to really awful correctness proofs. The next thing in the series is getting rid of the "it had been moved around, so somebody had clearly been taking/dropping references and we can just evict it from the shrink list and be done with that" crap - the things get much simpler if the rules become * call it under rcu_read_lock, with dentry locked * if returned true dentry, parent, inode locked, refcount is zero. * if returned false dentry locked, refcount is non-zero. It used to be that way, but removal of trylock loops had turned that into something much more subtle. Restoring the old semantics without trylocks on the slow path is doable and it makes analysis much simpler. BTW, where how aggressive do we want to be with d_lru_del()? We obviously do not do that on non-final dput, even if we have a dentry with positive refcount in LRU list. But when we hit e.g. shrink_dcache_parent(), all dentries in the subtree get d_lru_del(), whether they are busy or not. I'm not sure it's a good idea... Sure, we want __dentry_kill() to remove the victim from LRU and we want the same done to anything moved to a shrink list. Having LRU scanners ({prune,shrink}_dcache_sb() do that to dentries with positive refcount also makes sense. Do we really need the other cases?
On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote: > On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote: > > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > After fixing a couple of brainos, it seems to work. > > > > This all makes me unnaturally nervous, probably because it;s overly > > subtle, and I have lost the context for some of the rules. > > A bit of context: I started to look at the possibility of refcount overflows. > Writing the current rules for dentry refcounting and lifetime down was the > obvious first step, and that immediately turned into an awful mess. > > It is overly subtle. Even more so when you throw the shrink lists into > the mix - shrink_lock_dentry() got too smart for its own good, and that > leads to really awful correctness proofs. The next thing in the series > is getting rid of the "it had been moved around, so somebody had clearly > been taking/dropping references and we can just evict it from the > shrink list and be done with that" crap - the things get much simpler > if the rules become > * call it under rcu_read_lock, with dentry locked > * if returned true > dentry, parent, inode locked, refcount is zero. > * if returned false > dentry locked, refcount is non-zero. > It used to be that way, but removal of trylock loops had turned that > into something much more subtle. Restoring the old semantics without > trylocks on the slow path is doable and it makes analysis much simpler. It's also a perfect match to what we want in dentry_kill(), actually. And looking into that has caught another place too subtle for its own good: if (!IS_ROOT(dentry)) { parent = dentry->d_parent; if (unlikely(!spin_trylock(&parent->d_lock))) { parent = __lock_parent(dentry); if (likely(inode || !dentry->d_inode)) goto got_locks; /* negative that became positive */ if (parent) spin_unlock(&parent->d_lock); inode = dentry->d_inode; goto slow_positive; } } __dentry_kill(dentry); return parent; slow_positive: spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); parent = lock_parent(dentry); got_locks: That code (in dentry_kill()) relies upon the assumption that positive dentry couldn't have become negative under us while __lock_parent() had it unlocked. Which is only true because we have a positive refcount here. IOW, the patch is broken as posted upthread. It's really not hard to fix, fortunately, and what we end up in dentry_kill() looks a lot better that way - static struct dentry *dentry_kill(struct dentry *dentry) __releases(dentry->d_lock) __releases(rcu) { struct dentry *parent = NULL; if (likely(shrink_lock_dentry(dentry))) { if (!IS_ROOT(dentry)) parent = dentry->d_parent; rcu_read_unlock(); __dentry_kill(dentry); } else { rcu_read_unlock(); spin_unlock(&dentry->d_lock); } return parent; } Carving that series up will be interesting, though...
Hi Linus, On 2023/10/31 06:18, Linus Torvalds wrote: > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: >> >> After fixing a couple of brainos, it seems to work. > > This all makes me unnaturally nervous, probably because it;s overly > subtle, and I have lost the context for some of the rules. > > I like the patch, because honestly, our current logic for dput_fast() > is nasty, andI agree with you that the existence of d_op->d_delete() > shouldn't change the locking logic. > > At the same time, I just worry. That whole lockref_put_return() thing > has horrific semantics, and this is the only case that uses it, and I > wish we didn't need it. > > [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/ > too, and that looks completely bogus, since it doesn't check the > return value! ] 74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, 75 struct erofs_workgroup *grp) 76 { 77 struct erofs_sb_info *const sbi = EROFS_SB(sb); 78 struct erofs_workgroup *pre; 79 80 /* 81 * Bump up before making this visible to others for the XArray in order 82 * to avoid potential UAF without serialized by xa_lock. 83 */ 84 lockref_get(&grp->lockref); 85 86 repeat: 87 xa_lock(&sbi->managed_pslots); 88 pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, 89 NULL, grp, GFP_NOFS); 90 if (pre) { 91 if (xa_is_err(pre)) { 92 pre = ERR_PTR(xa_err(pre)); 93 } else if (!erofs_workgroup_get(pre)) { 94 /* try to legitimize the current in-tree one */ 95 xa_unlock(&sbi->managed_pslots); 96 cond_resched(); 97 goto repeat; 98 } 99 lockref_put_return(&grp->lockref); This line it just decreases the reference count just bumpped up at the line 84 (and it will always succeed). Since it finds a previous one at line 88, so the old one will be used (and be returned) instead of the new one and the new allocated one will be freed in the caller. Hopefully it explains the use case here. 100 grp = pre; 101 } 102 xa_unlock(&sbi->managed_pslots); 103 return grp; 104 } Thanks, Gao Xiang
On 2023/10/31 10:25, Gao Xiang wrote: > Hi Linus, > > On 2023/10/31 06:18, Linus Torvalds wrote: >> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: >>> >>> After fixing a couple of brainos, it seems to work. >> >> This all makes me unnaturally nervous, probably because it;s overly >> subtle, and I have lost the context for some of the rules. >> >> I like the patch, because honestly, our current logic for dput_fast() >> is nasty, andI agree with you that the existence of d_op->d_delete() >> shouldn't change the locking logic. >> >> At the same time, I just worry. That whole lockref_put_return() thing >> has horrific semantics, and this is the only case that uses it, and I >> wish we didn't need it. >> >> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/ >> too, and that looks completely bogus, since it doesn't check the >> return value! ] > > 74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, > 75 struct erofs_workgroup *grp) > 76 { > 77 struct erofs_sb_info *const sbi = EROFS_SB(sb); > 78 struct erofs_workgroup *pre; > 79 > 80 /* > 81 * Bump up before making this visible to others for the XArray in order > 82 * to avoid potential UAF without serialized by xa_lock. > 83 */ > 84 lockref_get(&grp->lockref); > 85 > 86 repeat: > 87 xa_lock(&sbi->managed_pslots); > 88 pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, > 89 NULL, grp, GFP_NOFS); > 90 if (pre) { > 91 if (xa_is_err(pre)) { > 92 pre = ERR_PTR(xa_err(pre)); > 93 } else if (!erofs_workgroup_get(pre)) { > 94 /* try to legitimize the current in-tree one */ > 95 xa_unlock(&sbi->managed_pslots); > 96 cond_resched(); > 97 goto repeat; > 98 } > 99 lockref_put_return(&grp->lockref); > > This line it just decreases the reference count just bumpped up at the > line 84 (and it will always succeed). Add some words: also since it's a new allocated one without populated so it won't be locked by others. > > Since it finds a previous one at line 88, so the old one will be used > (and be returned) instead of the new one and the new allocated one > will be freed in the caller. > > Hopefully it explains the use case here. > > 100 grp = pre; > 101 } > 102 xa_unlock(&sbi->managed_pslots); > 103 return grp; > 104 } > > Thanks, > Gao Xiang >
On Mon, 30 Oct 2023 at 16:25, Gao Xiang <hsiangkao@linux.alibaba.com> wrote: > > > > > [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/ > > too, and that looks completely bogus, since it doesn't check the > > return value! ] > > 74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, > 75 struct erofs_workgroup *grp) > 76 { > 77 struct erofs_sb_info *const sbi = EROFS_SB(sb); > 78 struct erofs_workgroup *pre; > 79 > 80 /* > 81 * Bump up before making this visible to others for the XArray in order > 82 * to avoid potential UAF without serialized by xa_lock. > 83 */ > 84 lockref_get(&grp->lockref); > 85 > 86 repeat: > 87 xa_lock(&sbi->managed_pslots); > 88 pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, > 89 NULL, grp, GFP_NOFS); > 90 if (pre) { > 91 if (xa_is_err(pre)) { > 92 pre = ERR_PTR(xa_err(pre)); > 93 } else if (!erofs_workgroup_get(pre)) { > 94 /* try to legitimize the current in-tree one */ > 95 xa_unlock(&sbi->managed_pslots); > 96 cond_resched(); > 97 goto repeat; > 98 } > 99 lockref_put_return(&grp->lockref); > > This line it just decreases the reference count just bumpped up at the > line 84 (and it will always succeed). You have two possible scenarios: - it doesn't always succeed, because somebody else has the lock on the grp->lockref right now, or because lockref doesn't do any optimized cases at all - nobody else can access grp->lockref at the same time, so the lock is pointless, so you shouldn't be using lockref in the first place, and certainly not lockref_put_return IOW, I don't see how lockref_put_return() could possibly *ever* be the right thing to do. The thing is, lockref_put_return() is fundamentally designed to be something that can fail. In fact, in some situations it will *always* fail. Check this out: #define USE_CMPXCHG_LOCKREF \ (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \ IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4) ... #if USE_CMPXCHG_LOCKREF ... #else #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0) #endif ... int lockref_put_return(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 0) return -1; , return new.count; ); return -1; } look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big because of spinlock debugging, or whatever), lockref_put_return() will *always* fail, expecting the caller to deal with that failure. So doing a lockref_put_return() without dealing with the failure case is FUNDAMENTALLY BROKEN. Yes, it's an odd function. It's a function that is literally designed for that dcache use-case, where we have a fast-path and a slow path, and the "lockref_put_return() fails" is the slow-path that needs to take the spinlock and do it carefully. You *cannot* use that function without failure handling. Really. Linus
On 2023/10/31 11:02, Linus Torvalds wrote: > On Mon, 30 Oct 2023 at 16:25, Gao Xiang <hsiangkao@linux.alibaba.com> wrote: >> >>> >>> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/ >>> too, and that looks completely bogus, since it doesn't check the >>> return value! ] >> >> 74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, >> 75 struct erofs_workgroup *grp) >> 76 { >> 77 struct erofs_sb_info *const sbi = EROFS_SB(sb); >> 78 struct erofs_workgroup *pre; >> 79 >> 80 /* >> 81 * Bump up before making this visible to others for the XArray in order >> 82 * to avoid potential UAF without serialized by xa_lock. >> 83 */ >> 84 lockref_get(&grp->lockref); >> 85 >> 86 repeat: >> 87 xa_lock(&sbi->managed_pslots); >> 88 pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, >> 89 NULL, grp, GFP_NOFS); >> 90 if (pre) { >> 91 if (xa_is_err(pre)) { >> 92 pre = ERR_PTR(xa_err(pre)); >> 93 } else if (!erofs_workgroup_get(pre)) { >> 94 /* try to legitimize the current in-tree one */ >> 95 xa_unlock(&sbi->managed_pslots); >> 96 cond_resched(); >> 97 goto repeat; >> 98 } >> 99 lockref_put_return(&grp->lockref); >> >> This line it just decreases the reference count just bumpped up at the >> line 84 (and it will always succeed). > > You have two possible scenarios: > > - it doesn't always succeed, because somebody else has the lock on > the grp->lockref right now, or because lockref doesn't do any > optimized cases at all > > - nobody else can access grp->lockref at the same time, so the lock > is pointless, so you shouldn't be using lockref in the first place, > and certainly not lockref_put_return Yeah, the second case is the real use case here. > > IOW, I don't see how lockref_put_return() could possibly *ever* be the > right thing to do. > > The thing is, lockref_put_return() is fundamentally designed to be > something that can fail. > > In fact, in some situations it will *always* fail. Check this out: > > #define USE_CMPXCHG_LOCKREF \ > (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \ > IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4) > ... > #if USE_CMPXCHG_LOCKREF > ... > #else > > #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0) > > #endif > ... > int lockref_put_return(struct lockref *lockref) > { > CMPXCHG_LOOP( > new.count--; > if (old.count <= 0) > return -1; > , > return new.count; > ); > return -1; > } > > look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big > because of spinlock debugging, or whatever), lockref_put_return() will > *always* fail, expecting the caller to deal with that failure. > > So doing a lockref_put_return() without dealing with the failure case > is FUNDAMENTALLY BROKEN. Yeah, thanks for point out, I get it. I think this really needs to be cleaned up since we don't actually care about locking here (since as I said it doesn't actually populate to XArray). > > Yes, it's an odd function. It's a function that is literally designed > for that dcache use-case, where we have a fast-path and a slow path, > and the "lockref_put_return() fails" is the slow-path that needs to > take the spinlock and do it carefully. > > You *cannot* use that function without failure handling. Really. I will fix+cleanup this path later and send upstream. Thanks for the heads up. Thanks, Gao Xiang > > Linus
On Mon, Oct 30, 2023 at 05:02:32PM -1000, Linus Torvalds wrote: > look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big > because of spinlock debugging, or whatever), lockref_put_return() will > *always* fail, expecting the caller to deal with that failure. > > So doing a lockref_put_return() without dealing with the failure case > is FUNDAMENTALLY BROKEN. > > Yes, it's an odd function. It's a function that is literally designed > for that dcache use-case, where we have a fast-path and a slow path, > and the "lockref_put_return() fails" is the slow-path that needs to > take the spinlock and do it carefully. > > You *cannot* use that function without failure handling. Really. Put another way, it's a fastpath-only thing. Not sure how much use would it be to slap __must_check on it; a warning along the lines of "DON'T USE UNLESS YOU HAVE READ <archive link>" in lockref.h might be useful. BTW, is there any reason for -128 for marking them dead? Looks like -1 wouldn't be any worse...
On Mon, 30 Oct 2023 at 17:26, Al Viro <viro@zeniv.linux.org.uk> wrote: > > BTW, is there any reason for -128 for marking them dead? Looks like > -1 wouldn't be any worse... It's *much* too long ago, but I have this dim memory of simply wanting to make sure that "dead" was clearly separate from "underflow". Linus
On Tue, Oct 31, 2023 at 01:53:51AM +0000, Al Viro wrote:
> Carving that series up will be interesting, though...
I think I have a sane carve-up; will post if it survives testing.
Cumulative diff follows:
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..213026d5b033 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry)
static struct dentry *__lock_parent(struct dentry *dentry)
{
struct dentry *parent;
- rcu_read_lock();
- spin_unlock(&dentry->d_lock);
again:
parent = READ_ONCE(dentry->d_parent);
spin_lock(&parent->d_lock);
@@ -642,7 +640,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
spin_unlock(&parent->d_lock);
goto again;
}
- rcu_read_unlock();
if (parent != dentry)
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
else
@@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
return NULL;
if (likely(spin_trylock(&parent->d_lock)))
return parent;
- return __lock_parent(dentry);
+ rcu_read_lock();
+ spin_unlock(&dentry->d_lock);
+ parent = __lock_parent(dentry);
+ rcu_read_unlock();
+ return parent;
+}
+
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy. Otherwise, return true and have
+ * that dentry's inode and parent both locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ struct dentry *parent = dentry->d_parent;
+
+ if (unlikely(dentry->d_lockref.count))
+ return false;
+
+ if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+ goto slow;
+ if (dentry == parent)
+ return true;
+ if (likely(spin_trylock(&parent->d_lock)))
+ return true;
+
+ if (inode)
+ spin_unlock(&inode->i_lock);
+slow:
+ spin_unlock(&dentry->d_lock);
+
+ for (;;) {
+ if (inode)
+ spin_lock(&inode->i_lock);
+ parent = __lock_parent(dentry);
+ if (likely(inode == dentry->d_inode))
+ break;
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ inode = dentry->d_inode;
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ }
+ if (likely(!dentry->d_lockref.count))
+ return true;
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ return false;
}
static inline bool retain_dentry(struct dentry *dentry)
@@ -680,7 +734,6 @@ static inline bool retain_dentry(struct dentry *dentry)
return false;
/* retain; LRU fodder */
- dentry->d_lockref.count--;
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
d_lru_add(dentry);
else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -703,61 +756,12 @@ void d_mark_dontcache(struct inode *inode)
}
EXPORT_SYMBOL(d_mark_dontcache);
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct inode *inode = dentry->d_inode;
- struct dentry *parent = NULL;
-
- if (inode && unlikely(!spin_trylock(&inode->i_lock)))
- goto slow_positive;
-
- if (!IS_ROOT(dentry)) {
- parent = dentry->d_parent;
- if (unlikely(!spin_trylock(&parent->d_lock))) {
- parent = __lock_parent(dentry);
- if (likely(inode || !dentry->d_inode))
- goto got_locks;
- /* negative that became positive */
- if (parent)
- spin_unlock(&parent->d_lock);
- inode = dentry->d_inode;
- goto slow_positive;
- }
- }
- __dentry_kill(dentry);
- return parent;
-
-slow_positive:
- spin_unlock(&dentry->d_lock);
- spin_lock(&inode->i_lock);
- spin_lock(&dentry->d_lock);
- parent = lock_parent(dentry);
-got_locks:
- if (unlikely(dentry->d_lockref.count != 1)) {
- dentry->d_lockref.count--;
- } else if (likely(!retain_dentry(dentry))) {
- __dentry_kill(dentry);
- return parent;
- }
- /* we are keeping it, after all */
- if (inode)
- spin_unlock(&inode->i_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- spin_unlock(&dentry->d_lock);
- return NULL;
-}
-
/*
* Try to do a lockless dput(), and return whether that was successful.
*
* If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
*
* The caller needs to hold the RCU read lock, so that the dentry is
* guaranteed to stay around even if the refcount goes down to zero!
@@ -768,15 +772,7 @@ static inline bool fast_dput(struct dentry *dentry)
unsigned int d_flags;
/*
- * If we have a d_op->d_delete() operation, we sould not
- * let the dentry count go to zero, so use "put_or_lock".
- */
- if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
- return lockref_put_or_lock(&dentry->d_lockref);
-
- /*
- * .. otherwise, we can try to just decrement the
- * lockref optimistically.
+ * try to decrement the lockref optimistically.
*/
ret = lockref_put_return(&dentry->d_lockref);
@@ -787,12 +783,12 @@ static inline bool fast_dput(struct dentry *dentry)
*/
if (unlikely(ret < 0)) {
spin_lock(&dentry->d_lock);
- if (dentry->d_lockref.count > 1) {
- dentry->d_lockref.count--;
+ if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
spin_unlock(&dentry->d_lock);
return true;
}
- return false;
+ dentry->d_lockref.count--;
+ goto locked;
}
/*
@@ -830,7 +826,7 @@ static inline bool fast_dput(struct dentry *dentry)
*/
smp_rmb();
d_flags = READ_ONCE(dentry->d_flags);
- d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+ d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
/* Nothing to do? Dropping the reference was all we needed? */
@@ -850,17 +846,11 @@ static inline bool fast_dput(struct dentry *dentry)
* else could have killed it and marked it dead. Either way, we
* don't need to do anything else.
*/
- if (dentry->d_lockref.count) {
+locked:
+ if (dentry->d_lockref.count || retain_dentry(dentry)) {
spin_unlock(&dentry->d_lock);
return true;
}
-
- /*
- * Re-get the reference we optimistically dropped. We hold the
- * lock, and we just tested that it was zero, so we can just
- * set it to 1.
- */
- dentry->d_lockref.count = 1;
return false;
}
@@ -903,29 +893,29 @@ void dput(struct dentry *dentry)
}
/* Slow case: now with the dentry lock held */
- rcu_read_unlock();
-
- if (likely(retain_dentry(dentry))) {
+ if (likely(lock_for_kill(dentry))) {
+ struct dentry *parent = dentry->d_parent;
+ rcu_read_unlock();
+ __dentry_kill(dentry);
+ if (dentry == parent)
+ return;
+ dentry = parent;
+ } else {
+ rcu_read_unlock();
spin_unlock(&dentry->d_lock);
return;
}
-
- dentry = dentry_kill(dentry);
}
}
EXPORT_SYMBOL(dput);
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- /* let the owner of the list it's on deal with it */
- --dentry->d_lockref.count;
- } else {
+ if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
if (dentry->d_flags & DCACHE_LRU_LIST)
d_lru_del(dentry);
- if (!--dentry->d_lockref.count)
- d_shrink_add(dentry, list);
+ d_shrink_add(dentry, list);
}
}
@@ -937,8 +927,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
return;
}
rcu_read_unlock();
- if (!retain_dentry(dentry))
- __dput_to_list(dentry, list);
+ to_shrink_list(dentry, list);
spin_unlock(&dentry->d_lock);
}
@@ -1117,58 +1106,6 @@ void d_prune_aliases(struct inode *inode)
}
EXPORT_SYMBOL(d_prune_aliases);
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list. Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
-{
- struct inode *inode;
- struct dentry *parent;
-
- if (dentry->d_lockref.count)
- return false;
-
- inode = dentry->d_inode;
- if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
- spin_unlock(&dentry->d_lock);
- spin_lock(&inode->i_lock);
- spin_lock(&dentry->d_lock);
- if (unlikely(dentry->d_lockref.count))
- goto out;
- /* changed inode means that somebody had grabbed it */
- if (unlikely(inode != dentry->d_inode))
- goto out;
- }
-
- parent = dentry->d_parent;
- if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
- return true;
-
- spin_unlock(&dentry->d_lock);
- spin_lock(&parent->d_lock);
- if (unlikely(parent != dentry->d_parent)) {
- spin_unlock(&parent->d_lock);
- spin_lock(&dentry->d_lock);
- goto out;
- }
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (likely(!dentry->d_lockref.count))
- return true;
- spin_unlock(&parent->d_lock);
-out:
- if (inode)
- spin_unlock(&inode->i_lock);
- return false;
-}
-
void shrink_dentry_list(struct list_head *list)
{
while (!list_empty(list)) {
@@ -1177,12 +1114,11 @@ void shrink_dentry_list(struct list_head *list)
dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
rcu_read_lock();
- if (!shrink_lock_dentry(dentry)) {
+ if (!lock_for_kill(dentry)) {
bool can_free = false;
rcu_read_unlock();
d_shrink_del(dentry);
- if (dentry->d_lockref.count < 0)
- can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ can_free = dentry->d_flags & DCACHE_MAY_FREE;
spin_unlock(&dentry->d_lock);
if (can_free)
dentry_free(dentry);
@@ -1191,8 +1127,8 @@ void shrink_dentry_list(struct list_head *list)
rcu_read_unlock();
d_shrink_del(dentry);
parent = dentry->d_parent;
- if (parent != dentry)
- __dput_to_list(parent, list);
+ if (parent != dentry && !--parent->d_lockref.count)
+ to_shrink_list(parent, list);
__dentry_kill(dentry);
}
}
@@ -1632,14 +1568,15 @@ void shrink_dcache_parent(struct dentry *parent)
if (data.victim) {
struct dentry *parent;
spin_lock(&data.victim->d_lock);
- if (!shrink_lock_dentry(data.victim)) {
+ if (!lock_for_kill(data.victim)) {
spin_unlock(&data.victim->d_lock);
rcu_read_unlock();
} else {
rcu_read_unlock();
parent = data.victim->d_parent;
- if (parent != data.victim)
- __dput_to_list(parent, &data.dispose);
+ if (parent != data.victim &&
+ !--parent->d_lockref.count)
+ to_shrink_list(parent, &data.dispose);
__dentry_kill(data.victim);
}
}
[NFS folks Cc'd] On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote: > On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote: > > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > After fixing a couple of brainos, it seems to work. > > > > This all makes me unnaturally nervous, probably because it;s overly > > subtle, and I have lost the context for some of the rules. > > A bit of context: I started to look at the possibility of refcount overflows. > Writing the current rules for dentry refcounting and lifetime down was the > obvious first step, and that immediately turned into an awful mess. > > It is overly subtle. Even more so when you throw the shrink lists into > the mix - shrink_lock_dentry() got too smart for its own good, and that > leads to really awful correctness proofs. ... and for another example of subtle shit, consider DCACHE_NORCU. Recall c0eb027e5aef "vfs: don't do RCU lookup of empty pathnames" and note that it relies upon never getting results of alloc_file_pseudo() with directory inode anywhere near descriptor tables. Back then I basically went "fine, nobody would ever use alloc_file_pseudo() for that anyway", but... there's a call in __nfs42_ssc_open() that doesn't have any obvious protection against ending up with directory inode. That does not end up anywhere near descriptor tables, as far as I can tell, fortunately. Unfortunately, it is quite capable of fucking the things up in different ways, even if it's promptly closed. d_instantiate() on directory inode is a really bad thing; a plenty of places expect to have only one alias for those, and would be very unhappy with that kind of crap without any RCU considerations. I'm pretty sure that this NFS code really does not want to use that for directories; the simplest solution would be to refuse alloc_file_pseudo() for directory inodes. NFS folks - do you have a problem with the following patch? ====== Make sure we never feed a directory to alloc_file_pseudo() That would be broken in a lot of ways, from UAF in pathwalk if that thing ever gets into descriptor tables, to royally screwing every place that relies upon the lack of aliases for directory inodes (i.e. quite a bit of VFS). Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- diff --git a/fs/file_table.c b/fs/file_table.c index ee21b3da9d08..5331a696896e 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -326,6 +326,9 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, struct path path; struct file *file; + if (WARN_ON_ONCE(S_ISDIR(inode->i_mode))) + return ERR_PTR(-EISDIR); + path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM);
On Tue, Oct 31, 2023 at 06:12:26AM +0000, Al Viro wrote: > On Tue, Oct 31, 2023 at 01:53:51AM +0000, Al Viro wrote: > > > Carving that series up will be interesting, though... > > I think I have a sane carve-up; will post if it survives testing. OK, current variant survives local testing. Lives in git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache Shortlog: fast_dput(): having ->d_delete() is not reason to delay refcount decrement fast_dput(): handle underflows gracefully fast_dput(): new rules for refcount __dput_to_list(): do decrement of refcount in the caller retain_dentry(): lift decrement of ->d_count into callers __dentry_kill(): get consistent rules for ->d_count dentry_kill(): don't bother with retain_dentry() on slow path Call retain_dentry() with refcount 0 fold the call of retain_dentry() into fast_dput() don't try to cut corners in shrink_lock_dentry() fold dentry_kill() into dput() get rid of __dget() shrink_dentry_list(): no need to check that dentry refcount is marked dead to_shrink_list(): call only if refcount is 0 switch select_collect{,2}() to use of to_shrink_list() Diffstat: fs/dcache.c | 268 ++++++++++++++++++++++-------------------------------------- 1 file changed, 96 insertions(+), 172 deletions(-) Individual patches in followups. Review and testing would be welcome, and it's obviously next cycle fodder. Massage of refcounting is in the first 11 commits, last 4 are followups.
On 31 Oct 2023, at 22:22, Al Viro wrote: > [NFS folks Cc'd] > > On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote: >> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote: >>> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: >>>> >>>> After fixing a couple of brainos, it seems to work. >>> >>> This all makes me unnaturally nervous, probably because it;s overly >>> subtle, and I have lost the context for some of the rules. >> >> A bit of context: I started to look at the possibility of refcount overflows. >> Writing the current rules for dentry refcounting and lifetime down was the >> obvious first step, and that immediately turned into an awful mess. >> >> It is overly subtle. Even more so when you throw the shrink lists into >> the mix - shrink_lock_dentry() got too smart for its own good, and that >> leads to really awful correctness proofs. > > ... and for another example of subtle shit, consider DCACHE_NORCU. Recall > c0eb027e5aef "vfs: don't do RCU lookup of empty pathnames" and note that > it relies upon never getting results of alloc_file_pseudo() with directory > inode anywhere near descriptor tables. > > Back then I basically went "fine, nobody would ever use alloc_file_pseudo() > for that anyway", but... there's a call in __nfs42_ssc_open() that doesn't > have any obvious protection against ending up with directory inode. > That does not end up anywhere near descriptor tables, as far as I can tell, > fortunately. > > Unfortunately, it is quite capable of fucking the things up in different > ways, even if it's promptly closed. d_instantiate() on directory inode > is a really bad thing; a plenty of places expect to have only one alias > for those, and would be very unhappy with that kind of crap without any > RCU considerations. > > I'm pretty sure that this NFS code really does not want to use that for > directories; the simplest solution would be to refuse alloc_file_pseudo() > for directory inodes. NFS folks - do you have a problem with the > following patch? It would be a protocol violation to use COPY on a directory: https://www.rfc-editor.org/rfc/rfc7862.html#section-15.2.3 Both SAVED_FH and CURRENT_FH must be regular files. If either SAVED_FH or CURRENT_FH is not a regular file, the operation MUST fail and return NFS4ERR_WRONG_TYPE. so nfsd4_verify_copy() does S_ISREG() checks before alloc_file_pseudo(). Ben
On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote: > On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote: > > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > After fixing a couple of brainos, it seems to work. > > > > This all makes me unnaturally nervous, probably because it;s overly > > subtle, and I have lost the context for some of the rules. > > A bit of context: I started to look at the possibility of refcount overflows. > Writing the current rules for dentry refcounting and lifetime down was the > obvious first step, and that immediately turned into an awful mess. > > It is overly subtle. Another piece of too subtle shite: ordering of ->d_iput() of child and __dentry_kill() of parent. As it is, in some cases it is possible for the latter to happen before the former. It is *not* possible in the cases when in-tree ->d_iput() instances actually look at the parent (all of those are due to sillyrename stuff), but the proof is convoluted and very brittle. The origin of that mess is in the interaction of shrink_dcache_for_umount() with shrink_dentry_list(). What we want to avoid is a directory looking like it's busy since shrink_dcache_for_umount() doesn't see any children to account for positive refcount of parent. The kinda-sorta solution we use is to decrement the parent's refcount *before* __dentry_kill() of child and put said parent into a shrink list. That makes shrink_dcache_for_umount() do the right thing, but it's possible to end up with parent freed before the child is done with; scenario is non-obvious, and rather hard to hit, but it's not impossible. dput() does no such thing - it does not decrement the parent's refcount until the child had been taken care of. That's fine, as far as shrink_dcache_for_umount() is concerned - this is not a false positive; with slightly different timing shrink_dcache_for_umount() would've reported the child as being busy. IOW, there should be no overlap between dput() in one thread and shrink_dcache_for_umount() in another. Unfortunately, memory eviction *can* come in the middle of shrink_dcache_for_umount(). Life would be much simpler if shrink_dentry_list() would not have to pull that kind of tricks and used the same ordering as dput() does. IMO there's a reasonably cheap way to achieve that: * have shrink_dcache_for_umount() mark the superblock (either in ->s_flags or inside the ->s_dentry_lru itself) and have the logics in retain_dentry() that does insertion into LRU list check ->d_sb for that mark, treating its presence as "do not retain". * after marking the superblock shrink_dentry_for_umount() is guaranteed that nothing new will be added to shrink list in question. Have it call shrink_dcache_sb() to drain LRU. * Now shrink_dentry_list() in one thread hitting a dentry on a superblock going throug shrink_dcache_for_umount() in another thread is always a bug and reporting busy dentries is the right thing to do. So we can switch shrink_dentry_list() to the same "drop reference to parent only after the child had been killed" ordering as we have in dput(). IMO that removes a fairly nasty trap for ->d_iput() and ->d_delete() instances. As for the overhead, the relevant fragment of retain_dentry() is if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) dentry->d_flags |= DCACHE_REFERENCED; return true; That would become if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { if (unlikely(dentry->d_sb is marked)) return false; d_lru_add(dentry); } else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) dentry->d_flags |= DCACHE_REFERENCED; return true; Note that d_lru_add() will hit ->d_sb->s_dentry_lru, so we are not adding memory traffic here; the else if part doesn't need to be touched - we only need to prevent insertions into LRU. Comments?
On Sun, Nov 05, 2023 at 07:54:16PM +0000, Al Viro wrote: > * have shrink_dcache_for_umount() mark the superblock (either in > ->s_flags or inside the ->s_dentry_lru itself) and have the logics > in retain_dentry() that does insertion into LRU list check ->d_sb for that > mark, treating its presence as "do not retain". BTW, no barriers, etc. are needed for those - once we are into shrink_dentry_for_umount() there must be no dput() calls anywhere on that filesystem.
On Sun, Nov 05, 2023 at 07:54:16PM +0000, Al Viro wrote: > On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote: > > On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote: > > > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > > > After fixing a couple of brainos, it seems to work. > > > > > > This all makes me unnaturally nervous, probably because it;s overly > > > subtle, and I have lost the context for some of the rules. > > > > A bit of context: I started to look at the possibility of refcount overflows. > > Writing the current rules for dentry refcounting and lifetime down was the > > obvious first step, and that immediately turned into an awful mess. > > > > It is overly subtle. > > Another piece of too subtle shite: ordering of ->d_iput() of child > and __dentry_kill() of parent. As it is, in some cases it is possible for > the latter to happen before the former. It is *not* possible in the cases > when in-tree ->d_iput() instances actually look at the parent (all of those > are due to sillyrename stuff), but the proof is convoluted and very brittle. > > The origin of that mess is in the interaction of shrink_dcache_for_umount() > with shrink_dentry_list(). What we want to avoid is a directory looking like > it's busy since shrink_dcache_for_umount() doesn't see any children to account > for positive refcount of parent. The kinda-sorta solution we use is to decrement > the parent's refcount *before* __dentry_kill() of child and put said parent > into a shrink list. That makes shrink_dcache_for_umount() do the right thing, > but it's possible to end up with parent freed before the child is done with; > scenario is non-obvious, and rather hard to hit, but it's not impossible. D'oh... We actually don't need to worry about eviction on memory pressure at that point; unregister_shrinker() is done early enough to prevent that. So shrink_dcache_for_umount() does not need to worry about shrink lists use in prune_dcache_sb(). Use in namespace_unlock() is guaranteed that all dentries involved either have a matching mount in the list of mounts to be dropped (and thus protected from simultaneous fs shutdown) or have a matching mount pinned by the caller. Use in mntput_no_expire() is guaranteed the same - all dentries involved are on superblock of mount we are going to drop after the call of shrink_dentry_list(). All other users also either have an active reference to superblock or are done by ->kill_sb() synchronously (and thus can't race with shrink_dcache_for_umount()) or are done async, but flushed and/or waited for by foofs_kill_sb() before they get to shrink_dcache_for_umount(). IOW, I'd been too paranoid in "Teach shrink_dcache_parent() to cope with mixed-filesystem shrink lists" - the real requirements are milder; in-tree users didn't need these games with parent. Dcache side of Tobin's Slab Movable Objects patches needed those, though... AFAICS, there are 3 options: 1) leave the current weirdness with ->d_iput() on child vs __dentry_kill() on parent. Document the requirement to ->d_iput() (and ->d_release()) to cope with that, promise that in case of sillyrename the ordering will be there and write down the proof of that. No code changes, rather revolting docs to write, trouble waiting to happen in ->d_iput(). 2) require that shrink_dentry_list() should never overlap with shrink_dcache_for_umount() on any of the filesystems represented in the shrink list, guarantee that parent won't get to __dentry_kill() before the child gets through __dentry_kill() completely and accept that resurrecting SMO stuff will require more work. Smallish patch, tolerable docs, probably the best option at the moment. 3) bite the bullet and get shrink_dentry_list() to coexist with shrink_dcache_for_umount(), with sane ordering of ->d_iput() vs. parent's __dentry_kill(). Doable, but AFAICS it will take a counter of children currently being killed in the parent dentry. shrink_dentry_list() would bump that on parent, __dentry_kill() the victim, then relock the parent and decrement that counter along with the main refcount. That would allow the shrink_dcache_for_umount() to cope with that crap. No requirements for shrink_dentry_kill() callers that way, sane environment for ->d_iput(), no obstacles for SMO stuff. OTOH, we need to get space for additional counter in struct dentry; again, doable (->d_subdirs/->d_child can be converted to hlist, saving us a pointer in each dentry), but... I'd leave that option alone until something that needs it would show up (e.g. if/when Tobin resurrects his patchset). My preference would be (2) for the coming cycle + prototype of a patch doing (3) on top of that for the future. Completely untested diff for (2) (on top of #work.dcache, sans the documentation update) below: diff --git a/fs/dcache.c b/fs/dcache.c index ccf41c5ee804..c978207f3fc4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1086,10 +1086,27 @@ void d_prune_aliases(struct inode *inode) } EXPORT_SYMBOL(d_prune_aliases); +static inline void shrink_kill(struct dentry *victim, struct list_head *list) +{ + struct dentry *parent = victim->d_parent; + + __dentry_kill(victim); + + if (parent == victim || lockref_put_or_lock(&parent->d_lockref)) + return; + + if (!WARN_ON_ONCE(parent->d_lockref.count != 1)) { + parent->d_lockref.count = 0; + to_shrink_list(parent, list); + } + spin_unlock(&parent->d_lock); +} + + void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { - struct dentry *dentry, *parent; + struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); @@ -1106,10 +1123,7 @@ void shrink_dentry_list(struct list_head *list) } rcu_read_unlock(); d_shrink_del(dentry); - parent = dentry->d_parent; - if (parent != dentry && !--parent->d_lockref.count) - to_shrink_list(parent, list); - __dentry_kill(dentry); + shrink_kill(dentry, list); } } @@ -1537,19 +1551,14 @@ void shrink_dcache_parent(struct dentry *parent) break; data.victim = NULL; d_walk(parent, &data, select_collect2); - if (data.victim) { - struct dentry *parent; + if (data.victim) { // rcu_read_lock held - see select_collect2() spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { rcu_read_unlock(); - parent = data.victim->d_parent; - if (parent != data.victim && - !--parent->d_lockref.count) - to_shrink_list(parent, &data.dispose); - __dentry_kill(data.victim); + shrink_kill(data.victim, &data.dispose); } } if (!list_empty(&data.dispose))
On Mon, Nov 06, 2023 at 05:53:53AM +0000, Al Viro wrote: > AFAICS, there are 3 options: > 1) leave the current weirdness with ->d_iput() on child vs __dentry_kill() > on parent. Document the requirement to ->d_iput() (and ->d_release()) to cope > with that, promise that in case of sillyrename the ordering will be there and > write down the proof of that. No code changes, rather revolting docs to > write, trouble waiting to happen in ->d_iput(). > 2) require that shrink_dentry_list() should never overlap with > shrink_dcache_for_umount() on any of the filesystems represented in the > shrink list, guarantee that parent won't get to __dentry_kill() before > the child gets through __dentry_kill() completely and accept that resurrecting > SMO stuff will require more work. Smallish patch, tolerable docs, probably > the best option at the moment. > 3) bite the bullet and get shrink_dentry_list() to coexist with > shrink_dcache_for_umount(), with sane ordering of ->d_iput() vs. parent's > __dentry_kill(). Doable, but AFAICS it will take a counter of children > currently being killed in the parent dentry. shrink_dentry_list() would > bump that on parent, __dentry_kill() the victim, then relock the parent > and decrement that counter along with the main refcount. That would allow > the shrink_dcache_for_umount() to cope with that crap. No requirements > for shrink_dentry_kill() callers that way, sane environment for ->d_iput(), > no obstacles for SMO stuff. OTOH, we need to get space for additional > counter in struct dentry; again, doable (->d_subdirs/->d_child can be > converted to hlist, saving us a pointer in each dentry), but... I'd > leave that option alone until something that needs it would show up > (e.g. if/when Tobin resurrects his patchset). 4) instead of having __dentry_kill() called with dentry, parent and inode locked and doing ->d_prune unhash remove from list of children unlock parent detach from inode unlock dentry and inode drop inode ->d_release relock dentry if on shrink list, mark as ready to free unlock dentry if was not on shrink list, free it go for calling it with just dentry and inode locked and do ->d_prune unhash detach from inode unlock dentry and inode drop inode ->d_release lock parent (if any, as usual) lock dentry remove from list of children if on shrink list, mark as ready to free unlock dentry if was on shrink list, free it decrement parent's refcount (again, if there was a parent) if refcount is still positive - unlock parent and return NULL otherwise return parent What changes: * caller needs milder locking environment; lock_for_kill() gets simpler. Note that only positive dentries can be moved, so inside __dentry_kill() we need no retry loops, etc. - ->d_parent is stable by the point we decide to remove from the list of children. * code that iterates through the list of children (not much of it) needs to cope with seeing negative unhashed dentries with refcount marked dead. Most of it will need no changes at all. * ->d_prune() instances are called without parent's ->d_lock; just the victim's one. Might require changes to out-of-tree filesystems. * dput() turns into if (!dentry) return; rcu_read_lock() if (fast_dput(dentry)) { rcu_read_unlock(); return; } while (lock_for_kill(dentry)) { // not bothering with the parent rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } spin_unlock(&dentry->d_lock); rcu_read_unlock(); since there's no point trying to avoid locking the parents - we need to grab those locks at some point anyway, just to remove a child from the list of children, and that way we return from __dentry_kill() with that lock held. * shrink_dentry_list() eviction of parents happens thus: do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); * sane order of ->d_iput() on child vs. __dentry_kill() on parent. * shrink_dcache_for_umount() does the right thing even if it overlaps shrink_dentry_list(). If that works, it's probably the best variant...
The series below is the fallout of trying to document the dentry refcounting and life cycle - basically, getting rid of the bits that had been too subtle and ugly to write them up. Results so far: * -136LoC (-170LoC not counting the additions in D/f/porting ;-) * considerably simpler locking for __dentry_kill() * fast_dput() is pretty much "dput() sans killing dentry; returns true if we are done, false - if dentry needs killing (in which case dentry will be left locked and refcount is known to be 0). * retain_dentry() not messing with refcounting - called with refcount 0 and ->d_lock held, returns whether we want the dentry retained in cache. * rules for shrink lists are much simpler now - to_shrink_list() puts a locked dentry with zero refcount into a shrink list, no need to guarantee that filesystem containing that dentry won't get shut down before we get to eventual shrink_dentry_list() - it would do the right thing. * ->d_iput() and ->d_release() no longer have weird corner cases when they could get called with parent already killed. That happened to be avoided in the cases where in-kernel instances would bother to work with the parent, but that used to be very brittle and convoluted. Now it's "parent is kept pinned until __dentry_kill() of child is done". * a bunch of other subtle shit is gone (e.g. the logics in shrink_lock_dentry() had rather subtle corner cases, with convoluted reasons why they won't break things - except that in some cases they would, etc.) This stuff lives in git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache2 individual patches are in followups. Help with reviewing and testing would be very welcome - it seems to survive the local beating, but it definitely needs more. Shortlog: Al Viro (22): struct dentry: get rid of randomize_layout idiocy switch nfsd_client_rmdir() to use of simple_recursive_removal() coda_flag_children(): cope with dentries turning negative dentry: switch the lists of children to hlist centralize killing dentry from shrink list get rid of __dget() shrink_dentry_list(): no need to check that dentry refcount is marked dead fast_dput(): having ->d_delete() is not reason to delay refcount decrement fast_dput(): handle underflows gracefully fast_dput(): new rules for refcount __dput_to_list(): do decrement of refcount in the callers Make retain_dentry() neutral with respect to refcounting __dentry_kill(): get consistent rules for victim's refcount dentry_kill(): don't bother with retain_dentry() on slow path Call retain_dentry() with refcount 0 fold the call of retain_dentry() into fast_dput() don't try to cut corners in shrink_lock_dentry() fold dentry_kill() into dput() to_shrink_list(): call only if refcount is 0 switch select_collect{,2}() to use of to_shrink_list() d_prune_aliases(): use a shrink list __dentry_kill(): new locking scheme Diffstat: Documentation/filesystems/porting.rst | 34 +++ arch/powerpc/platforms/cell/spufs/inode.c | 5 +- fs/afs/dynroot.c | 5 +- fs/autofs/expire.c | 7 +- fs/ceph/dir.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/coda/cache.c | 9 +- fs/dcache.c | 409 ++++++++++-------------------- fs/libfs.c | 45 ++-- fs/nfsd/nfsctl.c | 70 +---- fs/notify/fsnotify.c | 2 +- fs/tracefs/inode.c | 34 +-- include/linux/dcache.h | 22 +- 13 files changed, 255 insertions(+), 391 deletions(-) Patch description follows: Part 1 - preparations 01/22) struct dentry: get rid of randomize_layout idiocy. This is beyond ridiculous. There is a reason why that thing is cacheline-aligned... 02/22) nfsd_client_rmdir() and its gut open-code simple_recursive_removal(); converting to calling that cleans the things up in there *and* reduces the amount of places where we touch the list of children, which simplifies the work later in the series. 03/22) more fun caught while looking at the places that go through the lists of children: coda_flag_children() assumes that ->d_lock on parent is enough to prevent children going negative. Ain't so... 04/22) switch the lists of children to hlist. We never bother with accessing the list tail and using hlist saves us a pointer per each dentry. Besides, it ends up more readable that way. Fields used to hold the lists got renamed - d_children/d_sib instead of d_subdirs/d_child. Yes, any out-of-tree code that works with the lists of children gets loudly broken; not hard to fix. 05/22) centralize killing dentry from shrink list There are identical pieces of code in shrink_dentry_list() and shrink_dcache_for_umount(); they would require identical massage through the series below, unifying them into an inlined helper reduces the amount of noise. 06/22) get rid of __dget() fold into the sole remaining caller 07/22) shrink_dentry_list(): no need to check that dentry refcount is marked dead. We won't see DCACHE_MAY_FREE on anything that is *not* dead and checking d_flags is just as cheap as checking refcount. Part 2 - massage of dput() and friends 08/22) fast_dput(): having ->d_delete() is not reason to delay refcount decrement. ->d_delete() is a way for filesystem to tell that dentry is not worth keeping cached. It is not guaranteed to be called every time a dentry has refcount drop down to zero; it is not guaranteed to be called before dentry gets evicted. In other words, it is not suitable for any kind of keeping track of dentry state. None of the in-tree filesystems attempt to use it that way, fortunately. So the contortions done by fast_dput() (as well as dentry_kill()) are not warranted. fast_dput() certainly should treat having ->d_delete() instance as "can't assume we'll be keeping it", but that's not different from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar to making ->d_delete() returns true when called). 09/22) fast_dput(): handle underflows gracefully. If refcount is less than 1, we should just warn, unlock dentry and return true, so that the caller doesn't try to do anything else. Taking care of that leaves the rest of "lockref_put_return() has failed" case equivalent to "decrement refcount and rejoin the normal slow path after the point where we grab ->d_lock". 10/22) fast_dput(): new rules for refcount. Currently the "need caller to do more work" path in fast_dput() has refcount decremented, then, with ->d_lock held and refcount verified to have reached 0 fast_dput() forcibly resets the refcount to 1. Move that resetting refcount to 1 into the callers; later in the series it will be massaged out of existence. 11/22) __dput_to_list(): do decrement of refcount in the callers ... and rename it to to_shrink_list(), seeing that it no longer does dropping any references. 12/22) make retain_dentry() neutral with respect to refcounting. It used to decrement refcount if and only if it returned true. Lift those decrements into the callers. 13/22) __dentry_kill(): get consistent rules for victim's refcount Currently we call it with refcount equal to 1 when called from dentry_kill(); all other callers have it equal to 0. Make it always be called with zero refcount; on this step we just decrement it before the calls in dentry_kill(). That is safe, since all places that care about the value of refcount either do that under ->d_lock or hold a reference to dentry in question. Either is sufficient to prevent observing a dentry immediately prior to __dentry_kill() getting called from dentry_kill(). 14/22) dentry_kill(): don't bother with retain_dentry() on the slow path We have already checked it and dentry used to look not worthy of keeping. The only hard obstacle to evicting dentry is non-zero refcount; everything else is advisory - e.g. memory pressure could evict any dentry found with refcount zero. On the slow path in dentry_kill() we had dropped and regained ->d_lock; we must recheck the refcount, but everything else is not worth bothering with. Note that filesystem can not count upon ->d_delete() being called for dentry - not even once. Again, memory pressure (as well as d_prune_aliases(), or attempted rmdir() of ancestor, or...) will not call ->d_delete() at all. So from the correctness point of view we are fine doing the check only once. And it makes things simpler down the road. The doctor said "To the morgue", so to the morgue it is! 15/22) Call retain_dentry() with refcount 0. Instead of bumping it from 0 to 1, calling retain_dentry(), then decrementing it back to 0 (with ->d_lock held all the way through), just leave refcount at 0 through all of that. It will have a visible effect for ->d_delete() - now it can be called with refcount 0 instead of 1 and it can no longer play silly buggers with dropping/regaining ->d_lock. Not that any in-tree instances tried to (it's pretty hard to get right). Any out-of-tree ones will have to adjust (assuming they need any changes). 16/22) fold the call of retain_dentry() into fast_dput() Calls of retain_dentry() happen immediately after getting false from fast_dput() and getting true from retain_dentry() is treated the same way as non-zero refcount would be treated by fast_dput() - unlock dentry and bugger off. 17/22) don't try to cut corners in shrink_lock_dentry(). That is to say, do *not* treat the ->d_inode or ->d_parent changes as "it's hard, return false; somebody must have grabbed it, so even if has zero refcount, we don't need to bother killing it - final dput() from whoever grabbed it would've done everything". First of all, that is not guaranteed. It might have been dropped by shrink_kill() handling of victim's parent, which would've found it already on a shrink list (ours) and decided that they don't need to put it on their shrink list. What's more, dentry_kill() is doing pretty much the same thing, cutting its own set of corners (it assumes that dentry can't go from positive to negative, so its inode can change but only once and only in one direction). Doing that right allows to get rid of that not-quite-duplication and removes the only reason for re-incrementing refcount before the call of dentry_kill(). Replacement is called lock_for_kill(); called under rcu_read_lock and with ->d_lock held. If it returns false, dentry has non-zero refcount and the same locks are held. If it returns true, dentry has zero refcount and all locks required by __dentry_kill() are taken. Part of __lock_parent() had been lifted into lock_parent() to allow its reuse. Now it's called with rcu_read_lock already held and dentry already unlocked. Note that this is not the final change - locking requirements for __dentry_kill() are going to change later in the series and the set of locks taken by lock_for_kill() will be adjusted. 18/22) fold dentry_kill() into dput(). Not worth keeping separate. 19/22) to_shrink_list(): call only if refcount is 0 The only thing it does if refcount is not zero is d_lru_del(); no point, IMO, seeing that plain dput() does nothing of that sort... Note that 2 of 3 current callers are guaranteed that refcount is 0. 20/22) switch select_collect{,2}() to use of to_shrink_list() Same note about d_lru_del() as in (18/22). 21/22) d_prune_aliases(): use a shrink list Instead of dropping aliases one by one, restarting, etc., just collect them into a shrink list and kill them off in one pass. We don't really need the restarts - one alias can't pin another (directory has only one alias, and couldn't be its own ancestor anyway), so collecting everything that is not busy and taking it out would take care of everything evictable that had been there as we entered the function. And new aliases added while we'd been dropping old ones could just as easily have appeared right as we return to caller... 22/22) __dentry_kill(): new locking scheme Currently we enter __dentry_kill() with parent (along with the victim dentry and victim's inode) held locked. Then we mark dentry refcount as dead call ->d_prune() remove dentry from hash remove it from the parent's list of children unlock the parent, don't need it from that point on detach dentry from inode, unlock dentry and drop the inode (via ->d_iput()) call ->d_release() regain the lock on dentry check if it's on a shrink list (in which case freeing its empty husk has to be left to shrink_dentry_list()) or not (in which case we can free it ourselves). In the former case, mark it as an empty husk, so that shrink_dentry_list() would know it can free the sucker. drop the lock on dentry ... and usually the caller proceeds to drop a reference on the parent, possibly retaking the lock on it. That is painful for a bunch of reasons, starting with the need to take locks out of order, but not limited to that - the parent of positive dentry can change if we drop its ->d_lock, so getting these locks has to be done with care. Moreover, as soon as dentry is out of the parent's list of children, shrink_dcache_for_umount() won't see it anymore, making it appear as if the parent is inexplicably busy. We do work around that by having shrink_dentry_list() decrement the parent's refcount first and put it on shrink list to be evicted once we are done with __dentry_kill() of child, but that may in some cases lead to ->d_iput() on child called after the parent got killed. That doesn't happen in cases where in-tree ->d_iput() instances might want to look at the parent, but that's brittle as hell. Solution: do removal from the parent's list of children in the very end of __dentry_kill(). As the result, the callers do not need to lock the parent and by the time we really need the parent locked, dentry is negative and is guaranteed not to be moved around. It does mean that ->d_prune() will be called with parent not locked. It also means that we might see dentries in process of being torn down while going through the parent's list of children; those dentries will be unhashed, negative and with refcount marked dead. In practice, that's enough for in-tree code that looks through the list of children to do the right thing as-is. Out-of-tree code might need to be adjusted. Calling conventions: __dentry_kill(dentry) is called with dentry->d_lock held, along with ->i_lock of its inode (if any). It either returns the parent (locked, with refcount decremented to 0) or NULL (if there'd been no parent or if refcount decrement for parent hadn't reached 0). lock_for_kill() is adjusted for new requirements - it doesn't touch the parent's ->d_lock at all. Callers adjusted. Note that for dput() we don't need to bother with fast_dput() for the parent - we just need to check retain_dentry() for it, since its ->d_lock is still held since the moment when __dentry_kill() had taken it to remove the victim from the list of children. The kludge with early decrement of parent's refcount in shrink_dentry_list() is no longer needed - shrink_dcache_for_umount() sees the half-killed dentries in the list of children for as long as they are pinning the parent. They are easily recognized and accounted for by select_collect(), so we know we are not done yet. As the result, we always have the expected ordering for ->d_iput()/->d_release() vs. __dentry_kill() of the parent, no exceptions. Moreover, the current rules for shrink lists (one must make sure that shrink_dcache_for_umount() won't happen while any dentries from the superblock in question are on any shrink lists) are gone - shrink_dcache_for_umount() will do the right thing in all cases, taking such dentries out. Their empty husks (memory occupied by struct dentry itself + its external name, if any) will remain on the shrink lists, but they are no obstacles to filesystem shutdown. And such husks will get freed as soon as shrink_dentry_list() of the list they are on gets to them.
diff --git a/fs/dcache.c b/fs/dcache.c index 9f471fdb768b..af0e067f6982 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry) return false; /* retain; LRU fodder */ - dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) @@ -709,7 +708,7 @@ EXPORT_SYMBOL(d_mark_dontcache); * Returns dentry requiring refcount drop, or NULL if we're done. */ static struct dentry *dentry_kill(struct dentry *dentry) - __releases(dentry->d_lock) + __releases(dentry->d_lock) __releases(rcu) { struct inode *inode = dentry->d_inode; struct dentry *parent = NULL; @@ -730,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry) goto slow_positive; } } + rcu_read_unlock(); __dentry_kill(dentry); return parent; @@ -739,9 +739,8 @@ static struct dentry *dentry_kill(struct dentry *dentry) spin_lock(&dentry->d_lock); parent = lock_parent(dentry); got_locks: - if (unlikely(dentry->d_lockref.count != 1)) { - dentry->d_lockref.count--; - } else if (likely(!retain_dentry(dentry))) { + if (likely(dentry->d_lockref.count == 0 && !retain_dentry(dentry))) { + rcu_read_unlock(); __dentry_kill(dentry); return parent; } @@ -751,6 +750,7 @@ static struct dentry *dentry_kill(struct dentry *dentry) if (parent) spin_unlock(&parent->d_lock); spin_unlock(&dentry->d_lock); + rcu_read_unlock(); return NULL; } @@ -768,15 +768,7 @@ static inline bool fast_dput(struct dentry *dentry) unsigned int d_flags; /* - * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put_or_lock". - */ - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) - return lockref_put_or_lock(&dentry->d_lockref); - - /* - * .. otherwise, we can try to just decrement the - * lockref optimistically. + * decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); @@ -830,7 +822,7 @@ static inline bool fast_dput(struct dentry *dentry) */ smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE | DCACHE_DISCONNECTED | DCACHE_DONTCACHE; /* Nothing to do? Dropping the reference was all we needed? */ @@ -855,12 +847,6 @@ static inline bool fast_dput(struct dentry *dentry) return true; } - /* - * Re-get the reference we optimistically dropped. We hold the - * lock, and we just tested that it was zero, so we can just - * set it to 1. - */ - dentry->d_lockref.count = 1; return false; } @@ -903,10 +889,9 @@ void dput(struct dentry *dentry) } /* Slow case: now with the dentry lock held */ - rcu_read_unlock(); - if (likely(retain_dentry(dentry))) { spin_unlock(&dentry->d_lock); + rcu_read_unlock(); return; } @@ -918,14 +903,10 @@ EXPORT_SYMBOL(dput); static void __dput_to_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - /* let the owner of the list it's on deal with it */ - --dentry->d_lockref.count; - } else { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); - if (!--dentry->d_lockref.count) - d_shrink_add(dentry, list); + d_shrink_add(dentry, list); } } @@ -1191,7 +1172,7 @@ void shrink_dentry_list(struct list_head *list) rcu_read_unlock(); d_shrink_del(dentry); parent = dentry->d_parent; - if (parent != dentry) + if (parent != dentry && !--parent->d_lockref.count) __dput_to_list(parent, list); __dentry_kill(dentry); } @@ -1638,7 +1619,8 @@ void shrink_dcache_parent(struct dentry *parent) } else { rcu_read_unlock(); parent = data.victim->d_parent; - if (parent != data.victim) + if (parent != data.victim && + !--parent->d_lockref.count) __dput_to_list(parent, &data.dispose); __dentry_kill(data.victim); }