diff mbox series

[v3,4/6] inode: port __I_NEW to var event

Message ID 20240823-work-i_state-v3-4-5cd5fd207a57@kernel.org (mailing list archive)
State New
Headers show
Series inode: turn i_state into u32 | expand

Commit Message

Christian Brauner Aug. 23, 2024, 12:47 p.m. UTC
Port the __I_NEW mechanism to use the new var event mechanism.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
I'm not fully convinced that READ_ONCE() in wait_on_inode() is
sufficient when combined with smp_mb() before wake_up_var(). Maybe we
need smp_store_release() on inode->i_state before smp_mb() and paired
with smp_load_acquire() in wait_on_inode().
---
 fs/bcachefs/fs.c          | 10 ++++++----
 fs/dcache.c               |  7 ++++++-
 fs/inode.c                | 32 ++++++++++++++++++++++++--------
 include/linux/writeback.h |  3 ++-
 4 files changed, 38 insertions(+), 14 deletions(-)

Comments

Jan Kara Sept. 6, 2024, 1:30 p.m. UTC | #1
On Fri 23-08-24 14:47:38, Christian Brauner wrote:
> Port the __I_NEW mechanism to use the new var event mechanism.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> I'm not fully convinced that READ_ONCE() in wait_on_inode() is
> sufficient when combined with smp_mb() before wake_up_var(). Maybe we
> need smp_store_release() on inode->i_state before smp_mb() and paired
> with smp_load_acquire() in wait_on_inode().
> ---
>  fs/bcachefs/fs.c          | 10 ++++++----
>  fs/dcache.c               |  7 ++++++-
>  fs/inode.c                | 32 ++++++++++++++++++++++++--------
>  include/linux/writeback.h |  3 ++-
>  4 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
> index 94c392abef65..c0900c0c0f8a 100644
> --- a/fs/bcachefs/fs.c
> +++ b/fs/bcachefs/fs.c
> @@ -1644,14 +1644,16 @@ void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
>  				break;
>  			}
>  		} else if (clean_pass && this_pass_clean) {
> -			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
> -			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
> +			struct wait_bit_queue_entry wqe;
> +			struct wait_queue_head *wq_head;
>  
> -			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
> +			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
> +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> +					      TASK_UNINTERRUPTIBLE);
>  			mutex_unlock(&c->vfs_inodes_lock);
>  
>  			schedule();
> -			finish_wait(wq, &wait.wq_entry);
> +			finish_wait(wq_head, &wqe.wq_entry);

The opencoded waits are somewhat ugly. I understand this is because of
c->vfs_inodes_lock but perhaps we could introduce wait_on_inode() with some
macro magic (to do what we need before going to sleep) to make it easier?

> diff --git a/fs/inode.c b/fs/inode.c
> index 877c64a1bf63..37f20c7c2f72 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -734,7 +734,13 @@ static void evict(struct inode *inode)
>  	 * used as an indicator whether blocking on it is safe.
>  	 */
>  	spin_lock(&inode->i_lock);
> -	wake_up_bit(&inode->i_state, __I_NEW);
> +	/*
> +	 * Pairs with the barrier in prepare_to_wait_event() to make sure
> +	 * ___wait_var_event() either sees the bit cleared or
> +	 * waitqueue_active() check in wake_up_var() sees the waiter.
> +	 */
> +	smp_mb();

Why did you add smp_mb() here? We wake up on inode state change which has
happened quite some time ago and before several things guaranteeing a
barrier...

> +	inode_wake_up_bit(inode, __I_NEW);
>  	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
>  	spin_unlock(&inode->i_lock);
>  
...
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 56b85841ae4c..8f651bb0a1a5 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode);
>  /* writeback.h requires fs.h; it, too, is not included from here. */
>  static inline void wait_on_inode(struct inode *inode)
>  {
> -	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
> +	wait_var_event(inode_state_wait_address(inode, __I_NEW),
> +		       !(READ_ONCE(inode->i_state) & I_NEW));
>  }
>  
>  #ifdef CONFIG_CGROUP_WRITEBACK

As far as memory ordering goes, this looks good to me. But READ_ONCE() is
not really guaranteed to be enough in terms of what inode state you're
going to see when racing with i_state updates. i_state updates would have
to happen through WRITE_ONCE() for this to be safe (wrt insane compiler
deoptimizations ;)). Now that's not a new problem so I'm not sure we need
to deal with it in this patch set but it would probably deserve a comment.

								Honza
diff mbox series

Patch

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 94c392abef65..c0900c0c0f8a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1644,14 +1644,16 @@  void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
 				break;
 			}
 		} else if (clean_pass && this_pass_clean) {
-			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
-			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+			struct wait_bit_queue_entry wqe;
+			struct wait_queue_head *wq_head;
 
-			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+			prepare_to_wait_event(wq_head, &wqe.wq_entry,
+					      TASK_UNINTERRUPTIBLE);
 			mutex_unlock(&c->vfs_inodes_lock);
 
 			schedule();
-			finish_wait(wq, &wait.wq_entry);
+			finish_wait(wq_head, &wqe.wq_entry);
 			goto again;
 		}
 	}
diff --git a/fs/dcache.c b/fs/dcache.c
index 1af75fa68638..894e38cdf4d0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1908,8 +1908,13 @@  void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	__d_instantiate(entry, inode);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW & ~I_CREATING;
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
+	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(d_instantiate_new);
diff --git a/fs/inode.c b/fs/inode.c
index 877c64a1bf63..37f20c7c2f72 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -734,7 +734,13 @@  static void evict(struct inode *inode)
 	 * used as an indicator whether blocking on it is safe.
 	 */
 	spin_lock(&inode->i_lock);
-	wake_up_bit(&inode->i_state, __I_NEW);
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
+	smp_mb();
+	inode_wake_up_bit(inode, __I_NEW);
 	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
 	spin_unlock(&inode->i_lock);
 
@@ -1142,8 +1148,13 @@  void unlock_new_inode(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW & ~I_CREATING;
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
+	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(unlock_new_inode);
@@ -1154,8 +1165,13 @@  void discard_new_inode(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW;
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
+	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 	iput(inode);
 }
@@ -2344,8 +2360,8 @@  EXPORT_SYMBOL(inode_needs_sync);
  */
 static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
 {
-	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+	struct wait_bit_queue_entry wqe;
+	struct wait_queue_head *wq_head;
 
 	/*
 	 * Handle racing against evict(), see that routine for more details.
@@ -2356,14 +2372,14 @@  static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
 		return;
 	}
 
-	wq = bit_waitqueue(&inode->i_state, __I_NEW);
-	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+	wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+	prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->i_lock);
 	rcu_read_unlock();
 	if (is_inode_hash_locked)
 		spin_unlock(&inode_hash_lock);
 	schedule();
-	finish_wait(wq, &wait.wq_entry);
+	finish_wait(wq_head, &wqe.wq_entry);
 	if (is_inode_hash_locked)
 		spin_lock(&inode_hash_lock);
 	rcu_read_lock();
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56b85841ae4c..8f651bb0a1a5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -200,7 +200,8 @@  void inode_io_list_del(struct inode *inode);
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
 {
-	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
+	wait_var_event(inode_state_wait_address(inode, __I_NEW),
+		       !(READ_ONCE(inode->i_state) & I_NEW));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK