Message ID | 20240823-work-i_state-v3-4-5cd5fd207a57@kernel.org (mailing list archive)
State      | New
Series     | inode: turn i_state into u32
On Fri 23-08-24 14:47:38, Christian Brauner wrote:
> Port the __I_NEW mechanism to use the new var event mechanism.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> I'm not fully convinced that READ_ONCE() in wait_on_inode() is
> sufficient when combined with smp_mb() before wake_up_var(). Maybe we
> need smp_store_release() on inode->i_state before smp_mb() and paired
> with smp_load_acquire() in wait_on_inode().
> ---
>  fs/bcachefs/fs.c          | 10 ++++++----
>  fs/dcache.c               |  7 ++++++-
>  fs/inode.c                | 32 ++++++++++++++++++++++++--------
>  include/linux/writeback.h |  3 ++-
>  4 files changed, 38 insertions(+), 14 deletions(-)
>
> diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
> index 94c392abef65..c0900c0c0f8a 100644
> --- a/fs/bcachefs/fs.c
> +++ b/fs/bcachefs/fs.c
> @@ -1644,14 +1644,16 @@ void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
>                  break;
>              }
>          } else if (clean_pass && this_pass_clean) {
> -            wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
> -            DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
> +            struct wait_bit_queue_entry wqe;
> +            struct wait_queue_head *wq_head;
>
> -            prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
> +            wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
> +            prepare_to_wait_event(wq_head, &wqe.wq_entry,
> +                                  TASK_UNINTERRUPTIBLE);
>              mutex_unlock(&c->vfs_inodes_lock);
>
>              schedule();
> -            finish_wait(wq, &wait.wq_entry);
> +            finish_wait(wq_head, &wqe.wq_entry);

The opencoded waits are somewhat ugly. I understand this is because of
c->vfs_inodes_lock, but perhaps we could introduce a wait_on_inode()
variant with some macro magic (to do what we need before going to sleep)
to make this easier?

> diff --git a/fs/inode.c b/fs/inode.c
> index 877c64a1bf63..37f20c7c2f72 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -734,7 +734,13 @@ static void evict(struct inode *inode)
>       * used as an indicator whether blocking on it is safe.
>       */
>      spin_lock(&inode->i_lock);
> -    wake_up_bit(&inode->i_state, __I_NEW);
> +    /*
> +     * Pairs with the barrier in prepare_to_wait_event() to make sure
> +     * ___wait_var_event() either sees the bit cleared or
> +     * waitqueue_active() check in wake_up_var() sees the waiter.
> +     */
> +    smp_mb();

Why did you add smp_mb() here? We wake up on an inode state change that
happened quite some time ago, behind several things that already
guarantee a barrier...

> +    inode_wake_up_bit(inode, __I_NEW);
>      BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
>      spin_unlock(&inode->i_lock);
> ...
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 56b85841ae4c..8f651bb0a1a5 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode);
>  /* writeback.h requires fs.h; it, too, is not included from here. */
>  static inline void wait_on_inode(struct inode *inode)
>  {
> -    wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
> +    wait_var_event(inode_state_wait_address(inode, __I_NEW),
> +                   !(READ_ONCE(inode->i_state) & I_NEW));
>  }
>
>  #ifdef CONFIG_CGROUP_WRITEBACK

As far as memory ordering goes, this looks good to me. But READ_ONCE() is
not really guaranteed to be enough in terms of what inode state you are
going to see when racing with i_state updates: the i_state updates would
have to happen through WRITE_ONCE() for this to be safe (with respect to
insane compiler deoptimizations ;)). Now that's not a new problem, so I'm
not sure we need to deal with it in this patch set, but it would probably
deserve a comment.

                                                                Honza
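The ordering scheme both review comments above circle around, reduced to
its two halves. This is an illustrative sketch only, not code from the
patch: inode_wake_up_bit() and inode_state_wait_address() are helpers
introduced earlier in this series, and the WRITE_ONCE() reflects Honza's
WRITE_ONCE()/READ_ONCE() remark rather than anything the patch itself does.

#include <linux/fs.h>
#include <linux/wait_bit.h>

/* Waker side, modeled on unlock_new_inode(). */
static void sketch_unlock_new(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        /* Honza's point: pair the waiter's READ_ONCE() with WRITE_ONCE(). */
        WRITE_ONCE(inode->i_state, inode->i_state & ~I_NEW & ~I_CREATING);
        /*
         * Pairs with the barrier in prepare_to_wait_event(): either the
         * waiter sees the bit cleared, or the waitqueue_active() check
         * inside wake_up_var() sees the waiter already queued.
         */
        smp_mb();
        inode_wake_up_bit(inode, __I_NEW);
        spin_unlock(&inode->i_lock);
}

/* Waiter side: wait_on_inode() as the patch defines it. */
static void sketch_wait_on_inode(struct inode *inode)
{
        wait_var_event(inode_state_wait_address(inode, __I_NEW),
                       !(READ_ONCE(inode->i_state) & I_NEW));
}

Without the smp_mb(), the store clearing I_NEW and the lockless
waitqueue_active() read inside wake_up_var() could be reordered, and a
waiter that queued itself after a stale condition check would never be
woken.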
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 94c392abef65..c0900c0c0f8a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1644,14 +1644,16 @@ void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
                 break;
             }
         } else if (clean_pass && this_pass_clean) {
-            wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
-            DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+            struct wait_bit_queue_entry wqe;
+            struct wait_queue_head *wq_head;

-            prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+            wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+            prepare_to_wait_event(wq_head, &wqe.wq_entry,
+                                  TASK_UNINTERRUPTIBLE);
             mutex_unlock(&c->vfs_inodes_lock);

             schedule();
-            finish_wait(wq, &wait.wq_entry);
+            finish_wait(wq_head, &wqe.wq_entry);
             goto again;
         }
     }
diff --git a/fs/dcache.c b/fs/dcache.c
index 1af75fa68638..894e38cdf4d0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1908,8 +1908,13 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
     __d_instantiate(entry, inode);
     WARN_ON(!(inode->i_state & I_NEW));
     inode->i_state &= ~I_NEW & ~I_CREATING;
+    /*
+     * Pairs with the barrier in prepare_to_wait_event() to make sure
+     * ___wait_var_event() either sees the bit cleared or
+     * waitqueue_active() check in wake_up_var() sees the waiter.
+     */
     smp_mb();
-    wake_up_bit(&inode->i_state, __I_NEW);
+    inode_wake_up_bit(inode, __I_NEW);
     spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(d_instantiate_new);
diff --git a/fs/inode.c b/fs/inode.c
index 877c64a1bf63..37f20c7c2f72 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -734,7 +734,13 @@ static void evict(struct inode *inode)
      * used as an indicator whether blocking on it is safe.
      */
     spin_lock(&inode->i_lock);
-    wake_up_bit(&inode->i_state, __I_NEW);
+    /*
+     * Pairs with the barrier in prepare_to_wait_event() to make sure
+     * ___wait_var_event() either sees the bit cleared or
+     * waitqueue_active() check in wake_up_var() sees the waiter.
+     */
+    smp_mb();
+    inode_wake_up_bit(inode, __I_NEW);
     BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
     spin_unlock(&inode->i_lock);

@@ -1142,8 +1148,13 @@ void unlock_new_inode(struct inode *inode)
     spin_lock(&inode->i_lock);
     WARN_ON(!(inode->i_state & I_NEW));
     inode->i_state &= ~I_NEW & ~I_CREATING;
+    /*
+     * Pairs with the barrier in prepare_to_wait_event() to make sure
+     * ___wait_var_event() either sees the bit cleared or
+     * waitqueue_active() check in wake_up_var() sees the waiter.
+     */
     smp_mb();
-    wake_up_bit(&inode->i_state, __I_NEW);
+    inode_wake_up_bit(inode, __I_NEW);
     spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(unlock_new_inode);
@@ -1154,8 +1165,13 @@ void discard_new_inode(struct inode *inode)
     spin_lock(&inode->i_lock);
     WARN_ON(!(inode->i_state & I_NEW));
     inode->i_state &= ~I_NEW;
+    /*
+     * Pairs with the barrier in prepare_to_wait_event() to make sure
+     * ___wait_var_event() either sees the bit cleared or
+     * waitqueue_active() check in wake_up_var() sees the waiter.
+     */
     smp_mb();
-    wake_up_bit(&inode->i_state, __I_NEW);
+    inode_wake_up_bit(inode, __I_NEW);
     spin_unlock(&inode->i_lock);
     iput(inode);
 }
@@ -2344,8 +2360,8 @@ EXPORT_SYMBOL(inode_needs_sync);
  */
 static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
 {
-    wait_queue_head_t *wq;
-    DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+    struct wait_bit_queue_entry wqe;
+    struct wait_queue_head *wq_head;

     /*
      * Handle racing against evict(), see that routine for more details.
@@ -2356,14 +2372,14 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
         return;
     }
-    wq = bit_waitqueue(&inode->i_state, __I_NEW);
-    prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+    wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+    prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
     spin_unlock(&inode->i_lock);
     rcu_read_unlock();
     if (is_inode_hash_locked)
         spin_unlock(&inode_hash_lock);
     schedule();
-    finish_wait(wq, &wait.wq_entry);
+    finish_wait(wq_head, &wqe.wq_entry);
     if (is_inode_hash_locked)
         spin_lock(&inode_hash_lock);
     rcu_read_lock();
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56b85841ae4c..8f651bb0a1a5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode);
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
 {
-    wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
+    wait_var_event(inode_state_wait_address(inode, __I_NEW),
+                   !(READ_ONCE(inode->i_state) & I_NEW));
 }

 #ifdef CONFIG_CGROUP_WRITEBACK
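One shape Honza's "macro magic" suggestion could take, sketched against
___wait_var_event(), the internal helper the wait_var_event() family is
built on. Everything here is hypothetical: the name
wait_on_inode_bit_locked() is invented for illustration, and unlike the
open-coded bcachefs wait this variant retakes the lock and re-checks the
condition in a loop.

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

/*
 * Wait for `bit` to clear in inode->i_state, dropping `lock` while
 * asleep. prepare_to_wait_event() runs before the unlock, so a
 * concurrent wake-up cannot be missed.
 */
#define wait_on_inode_bit_locked(inode, bit, lock)                      \
        ___wait_var_event(inode_state_wait_address((inode), (bit)),     \
                          !(READ_ONCE((inode)->i_state) & BIT(bit)),    \
                          TASK_UNINTERRUPTIBLE, 0, 0,                   \
                          ({ mutex_unlock(lock);                        \
                             schedule();                                \
                             mutex_lock(lock); }))

That would shrink the bcachefs block above to a single
wait_on_inode_bit_locked(&inode->v, __I_NEW, &c->vfs_inodes_lock) call,
but note that bch2_evict_subvolume_inodes() could not use it as-is: the
open-coded version deliberately never touches inode->v.i_state again
after waking, since the inode may well have been freed by then, and
instead revalidates everything from scratch via goto again. A real
helper would have to preserve that one-shot behavior.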
Port the __I_NEW mechanism to use the new var event mechanism.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
I'm not fully convinced that READ_ONCE() in wait_on_inode() is
sufficient when combined with smp_mb() before wake_up_var(). Maybe we
need smp_store_release() on inode->i_state before smp_mb() and paired
with smp_load_acquire() in wait_on_inode().
---
 fs/bcachefs/fs.c          | 10 ++++++----
 fs/dcache.c               |  7 ++++++-
 fs/inode.c                | 32 ++++++++++++++++++++++++--------
 include/linux/writeback.h |  3 ++-
 4 files changed, 38 insertions(+), 14 deletions(-)
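Sketched out, the release/acquire variant floated in the note above would
look roughly like this. It is only an illustration of the alternative
being considered, written on the assumption that i_state is still
unsigned long at this point in the series (the series converts it to u32
later), and it is not what the patch merges.

#include <linux/fs.h>
#include <linux/wait_bit.h>

/* Store side, under inode->i_lock, modeled on unlock_new_inode(). */
static void sketch_clear_i_new_release(struct inode *inode)
{
        unsigned long state = inode->i_state & ~I_NEW & ~I_CREATING;

        /*
         * Release: every store before this one is visible to an
         * acquire load that observes the cleared bit.
         */
        smp_store_release(&inode->i_state, state);
        /*
         * Still needed: pairs with prepare_to_wait_event() so the
         * waitqueue_active() check in wake_up_var() sees the waiter.
         */
        smp_mb();
        inode_wake_up_bit(inode, __I_NEW);
}

/* Load side: smp_load_acquire() instead of READ_ONCE(). */
static void sketch_wait_on_inode_acquire(struct inode *inode)
{
        wait_var_event(inode_state_wait_address(inode, __I_NEW),
                       !(smp_load_acquire(&inode->i_state) & I_NEW));
}

The acquire/release pair would guarantee that everything written before
the bit was cleared (for instance, the fully initialized inode) is
visible to the woken waiter, which is exactly the property the note
above is unsure plain READ_ONCE() provides.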