diff mbox series

[v7,5/5] gfs2: Fix iomap write page reclaim deadlock

Message ID 20190429220934.10415-6-agruenba@redhat.com (mailing list archive)
State New, archived
Headers show
Series iomap and gfs2 fixes | expand

Commit Message

Andreas Gruenbacher April 29, 2019, 10:09 p.m. UTC
Since commit 64bc06bb32ee ("gfs2: iomap buffered write support"), gfs2 is doing
buffered writes by starting a transaction in iomap_begin, writing a range of
pages, and ending that transaction in iomap_end.  This approach suffers from
two problems:

  (1) Any allocations necessary for the write are done in iomap_begin, so when
  the data aren't journaled, there is no need to keep the transaction open
  until iomap_end.

  (2) Transactions keep the gfs2 log flush lock held.  When
  iomap_file_buffered_write calls balance_dirty_pages, this can end up calling
  gfs2_write_inode, which will try to flush the log.  This requires taking the
  log flush lock which is already held, resulting in a deadlock.

Fix both of these issues by not keeping transactions open from iomap_begin to
iomap_end.  Instead, when necessary (for inline and journaled data), start a
small transaction in page_prepare and end it in page_done.

Reported-by: Edwin Török <edvin.torok@citrix.com>
Fixes: 64bc06bb32ee ("gfs2: iomap buffered write support")
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/aops.c | 14 +++++---
 fs/gfs2/bmap.c | 88 +++++++++++++++++++++++++++-----------------------
 2 files changed, 58 insertions(+), 44 deletions(-)

Comments

Darrick J. Wong April 30, 2019, 3:32 p.m. UTC | #1
On Tue, Apr 30, 2019 at 12:09:34AM +0200, Andreas Gruenbacher wrote:
> Since commit 64bc06bb32ee ("gfs2: iomap buffered write support"), gfs2 is doing
> buffered writes by starting a transaction in iomap_begin, writing a range of
> pages, and ending that transaction in iomap_end.  This approach suffers from
> two problems:
> 
>   (1) Any allocations necessary for the write are done in iomap_begin, so when
>   the data aren't journaled, there is no need for keeping the transaction open
>   until iomap_end.
> 
>   (2) Transactions keep the gfs2 log flush lock held.  When
>   iomap_file_buffered_write calls balance_dirty_pages, this can end up calling
>   gfs2_write_inode, which will try to flush the log.  This requires taking the
>   log flush lock which is already held, resulting in a deadlock.

/me wonders how holding the log flush lock doesn't seriously limit
performance, but gfs2 isn't my fight so I'll set that aside and assume
that a patch S-o-B'd by both maintainers is ok. :)

How should we merge this patch #5?  It doesn't touch fs/iomap.c itself,
so do you want me to pull it into the iomap branch along with the
previous four patches?  That would be fine with me (and easier than a
multi-tree merge mess)...

--D

> 
> Fix both of these issues by not keeping transactions open from iomap_begin to
> iomap_end.  Instead, start a small transaction in page_prepare and end it in
> page_done when necessary.
> 
> Reported-by: Edwin Török <edvin.torok@citrix.com>
> Fixes: 64bc06bb32ee ("gfs2: iomap buffered write support")
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
>  fs/gfs2/aops.c | 14 +++++---
>  fs/gfs2/bmap.c | 88 +++++++++++++++++++++++++++-----------------------
>  2 files changed, 58 insertions(+), 44 deletions(-)
> 
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index 05dd78f4b2b3..6210d4429d84 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -649,7 +649,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
>   */
>  void adjust_fs_space(struct inode *inode)
>  {
> -	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
> +	struct gfs2_sbd *sdp = GFS2_SB(inode);
>  	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
>  	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
>  	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
> @@ -657,10 +657,13 @@ void adjust_fs_space(struct inode *inode)
>  	struct buffer_head *m_bh, *l_bh;
>  	u64 fs_total, new_free;
>  
> +	if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0)
> +		return;
> +
>  	/* Total up the file system space, according to the latest rindex. */
>  	fs_total = gfs2_ri_total(sdp);
>  	if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
> -		return;
> +		goto out;
>  
>  	spin_lock(&sdp->sd_statfs_spin);
>  	gfs2_statfs_change_in(m_sc, m_bh->b_data +
> @@ -675,11 +678,14 @@ void adjust_fs_space(struct inode *inode)
>  	gfs2_statfs_change(sdp, new_free, new_free, 0);
>  
>  	if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
> -		goto out;
> +		goto out2;
>  	update_statfs(sdp, m_bh, l_bh);
>  	brelse(l_bh);
> -out:
> +out2:
>  	brelse(m_bh);
> +out:
> +	sdp->sd_rindex_uptodate = 0;
> +	gfs2_trans_end(sdp);
>  }
>  
>  /**
> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index aa014725f84a..27c82f4aaf32 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -991,17 +991,28 @@ static void gfs2_write_unlock(struct inode *inode)
>  	gfs2_glock_dq_uninit(&ip->i_gh);
>  }
>  
> +static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
> +				   unsigned len, struct iomap *iomap)
> +{
> +	struct gfs2_sbd *sdp = GFS2_SB(inode);
> +
> +	return gfs2_trans_begin(sdp, RES_DINODE + (len >> inode->i_blkbits), 0);
> +}
> +
>  static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
>  				 unsigned copied, struct page *page,
>  				 struct iomap *iomap)
>  {
>  	struct gfs2_inode *ip = GFS2_I(inode);
> +	struct gfs2_sbd *sdp = GFS2_SB(inode);
>  
> -	if (page)
> +	if (page && !gfs2_is_stuffed(ip))
>  		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
> +	gfs2_trans_end(sdp);
>  }
>  
>  static const struct iomap_page_ops gfs2_iomap_page_ops = {
> +	.page_prepare = gfs2_iomap_page_prepare,
>  	.page_done = gfs2_iomap_page_done,
>  };
>  
> @@ -1057,31 +1068,45 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
>  	if (alloc_required)
>  		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
>  
> -	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
> -	if (ret)
> -		goto out_trans_fail;
> +	if (unstuff || iomap->type == IOMAP_HOLE) {
> +		struct gfs2_trans *tr;
>  
> -	if (unstuff) {
> -		ret = gfs2_unstuff_dinode(ip, NULL);
> +		ret = gfs2_trans_begin(sdp, rblocks,
> +				       iomap->length >> inode->i_blkbits);
>  		if (ret)
> -			goto out_trans_end;
> -		release_metapath(mp);
> -		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
> -				     flags, iomap, mp);
> -		if (ret)
> -			goto out_trans_end;
> -	}
> +			goto out_trans_fail;
>  
> -	if (iomap->type == IOMAP_HOLE) {
> -		ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
> -		if (ret) {
> -			gfs2_trans_end(sdp);
> -			gfs2_inplace_release(ip);
> -			punch_hole(ip, iomap->offset, iomap->length);
> -			goto out_qunlock;
> +		if (unstuff) {
> +			ret = gfs2_unstuff_dinode(ip, NULL);
> +			if (ret)
> +				goto out_trans_end;
> +			release_metapath(mp);
> +			ret = gfs2_iomap_get(inode, iomap->offset,
> +					     iomap->length, flags, iomap, mp);
> +			if (ret)
> +				goto out_trans_end;
> +		}
> +
> +		if (iomap->type == IOMAP_HOLE) {
> +			ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
> +			if (ret) {
> +				gfs2_trans_end(sdp);
> +				gfs2_inplace_release(ip);
> +				punch_hole(ip, iomap->offset, iomap->length);
> +				goto out_qunlock;
> +			}
>  		}
> +
> +		tr = current->journal_info;
> +		if (tr->tr_num_buf_new)
> +			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> +		else
> +			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[0]);
> +
> +		gfs2_trans_end(sdp);
>  	}
> -	if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
> +
> +	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
>  		iomap->page_ops = &gfs2_iomap_page_ops;
>  	return 0;
>  
> @@ -1121,10 +1146,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
>  		    iomap->type != IOMAP_MAPPED)
>  			ret = -ENOTBLK;
>  	}
> -	if (!ret) {
> -		get_bh(mp.mp_bh[0]);
> -		iomap->private = mp.mp_bh[0];
> -	}
>  	release_metapath(&mp);
>  	trace_gfs2_iomap_end(ip, iomap, ret);
>  	return ret;
> @@ -1135,27 +1156,16 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
>  {
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  	struct gfs2_sbd *sdp = GFS2_SB(inode);
> -	struct gfs2_trans *tr = current->journal_info;
> -	struct buffer_head *dibh = iomap->private;
>  
>  	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
>  		goto out;
>  
> -	if (iomap->type != IOMAP_INLINE) {
> +	if (!gfs2_is_stuffed(ip))
>  		gfs2_ordered_add_inode(ip);
>  
> -		if (tr->tr_num_buf_new)
> -			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> -		else
> -			gfs2_trans_add_meta(ip->i_gl, dibh);
> -	}
> -
> -	if (inode == sdp->sd_rindex) {
> +	if (inode == sdp->sd_rindex)
>  		adjust_fs_space(inode);
> -		sdp->sd_rindex_uptodate = 0;
> -	}
>  
> -	gfs2_trans_end(sdp);
>  	gfs2_inplace_release(ip);
>  
>  	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
> @@ -1175,8 +1185,6 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
>  	gfs2_write_unlock(inode);
>  
>  out:
> -	if (dibh)
> -		brelse(dibh);
>  	return 0;
>  }
>  
> -- 
> 2.20.1
>
Andreas Gruenbacher April 30, 2019, 3:39 p.m. UTC | #2
On Tue, 30 Apr 2019 at 17:33, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Tue, Apr 30, 2019 at 12:09:34AM +0200, Andreas Gruenbacher wrote:
> > Since commit 64bc06bb32ee ("gfs2: iomap buffered write support"), gfs2 is doing
> > buffered writes by starting a transaction in iomap_begin, writing a range of
> > pages, and ending that transaction in iomap_end.  This approach suffers from
> > two problems:
> >
> >   (1) Any allocations necessary for the write are done in iomap_begin, so when
> >   the data aren't journaled, there is no need for keeping the transaction open
> >   until iomap_end.
> >
> >   (2) Transactions keep the gfs2 log flush lock held.  When
> >   iomap_file_buffered_write calls balance_dirty_pages, this can end up calling
> >   gfs2_write_inode, which will try to flush the log.  This requires taking the
> >   log flush lock which is already held, resulting in a deadlock.
>
> /me wonders how holding the log flush lock doesn't seriously limit
> performance, but gfs2 isn't my fight so I'll set that aside and assume
> that a patch S-o-B'd by both maintainers is ok. :)

This only affects inline and journaled data, not standard writes, so
it's not quite as bad as it looks.

> How should we merge this patch #5?  It doesn't touch fs/iomap.c itself,
> so do you want me to pull it into the iomap branch along with the
> previous four patches?  That would be fine with me (and easier than a
> multi-tree merge mess)...

I'd prefer to get this merged via the gfs2 tree once the iomap fixes
have been pulled.

Thanks,
Andreas
Darrick J. Wong April 30, 2019, 3:47 p.m. UTC | #3
On Tue, Apr 30, 2019 at 05:39:28PM +0200, Andreas Gruenbacher wrote:
> On Tue, 30 Apr 2019 at 17:33, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Apr 30, 2019 at 12:09:34AM +0200, Andreas Gruenbacher wrote:
> > > Since commit 64bc06bb32ee ("gfs2: iomap buffered write support"), gfs2 is doing
> > > buffered writes by starting a transaction in iomap_begin, writing a range of
> > > pages, and ending that transaction in iomap_end.  This approach suffers from
> > > two problems:
> > >
> > >   (1) Any allocations necessary for the write are done in iomap_begin, so when
> > >   the data aren't journaled, there is no need for keeping the transaction open
> > >   until iomap_end.
> > >
> > >   (2) Transactions keep the gfs2 log flush lock held.  When
> > >   iomap_file_buffered_write calls balance_dirty_pages, this can end up calling
> > >   gfs2_write_inode, which will try to flush the log.  This requires taking the
> > >   log flush lock which is already held, resulting in a deadlock.
> >
> > /me wonders how holding the log flush lock doesn't seriously limit
> > performance, but gfs2 isn't my fight so I'll set that aside and assume
> > that a patch S-o-B'd by both maintainers is ok. :)
> 
> This only affects inline and journaled data, not standard writes, so
> it's not quite as bad as it looks.

Ah, ok.

> > How should we merge this patch #5?  It doesn't touch fs/iomap.c itself,
> > so do you want me to pull it into the iomap branch along with the
> > previous four patches?  That would be fine with me (and easier than a
> > multi-tree merge mess)...
> 
> I'd prefer to get this merged via the gfs2 tree once the iomap fixes
> have been pulled.

Ok, I'll take the first four patches through the iomap branch and cc you
on the pull request.

--D

> 
> Thanks,
> Andreas
Andreas Grünbacher April 30, 2019, 4:15 p.m. UTC | #4
Am Di., 30. Apr. 2019 um 17:48 Uhr schrieb Darrick J. Wong
<darrick.wong@oracle.com>:
> Ok, I'll take the first four patches through the iomap branch and cc you
> on the pull request.

Ok great, thanks.

Andreas
diff mbox series

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 05dd78f4b2b3..6210d4429d84 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -649,7 +649,7 @@  static int gfs2_readpages(struct file *file, struct address_space *mapping,
  */
 void adjust_fs_space(struct inode *inode)
 {
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -657,10 +657,13 @@  void adjust_fs_space(struct inode *inode)
 	struct buffer_head *m_bh, *l_bh;
 	u64 fs_total, new_free;
 
+	if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0)
+		return;
+
 	/* Total up the file system space, according to the latest rindex. */
 	fs_total = gfs2_ri_total(sdp);
 	if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
-		return;
+		goto out;
 
 	spin_lock(&sdp->sd_statfs_spin);
 	gfs2_statfs_change_in(m_sc, m_bh->b_data +
@@ -675,11 +678,14 @@  void adjust_fs_space(struct inode *inode)
 	gfs2_statfs_change(sdp, new_free, new_free, 0);
 
 	if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
-		goto out;
+		goto out2;
 	update_statfs(sdp, m_bh, l_bh);
 	brelse(l_bh);
-out:
+out2:
 	brelse(m_bh);
+out:
+	sdp->sd_rindex_uptodate = 0;
+	gfs2_trans_end(sdp);
 }
 
 /**
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index aa014725f84a..27c82f4aaf32 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -991,17 +991,28 @@  static void gfs2_write_unlock(struct inode *inode)
 	gfs2_glock_dq_uninit(&ip->i_gh);
 }
 
+static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
+				   unsigned len, struct iomap *iomap)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	return gfs2_trans_begin(sdp, RES_DINODE + (len >> inode->i_blkbits), 0);
+}
+
 static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
 				 unsigned copied, struct page *page,
 				 struct iomap *iomap)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 
-	if (page)
+	if (page && !gfs2_is_stuffed(ip))
 		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+	gfs2_trans_end(sdp);
 }
 
 static const struct iomap_page_ops gfs2_iomap_page_ops = {
+	.page_prepare = gfs2_iomap_page_prepare,
 	.page_done = gfs2_iomap_page_done,
 };
 
@@ -1057,31 +1068,45 @@  static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 	if (alloc_required)
 		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
 
-	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
-	if (ret)
-		goto out_trans_fail;
+	if (unstuff || iomap->type == IOMAP_HOLE) {
+		struct gfs2_trans *tr;
 
-	if (unstuff) {
-		ret = gfs2_unstuff_dinode(ip, NULL);
+		ret = gfs2_trans_begin(sdp, rblocks,
+				       iomap->length >> inode->i_blkbits);
 		if (ret)
-			goto out_trans_end;
-		release_metapath(mp);
-		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
-				     flags, iomap, mp);
-		if (ret)
-			goto out_trans_end;
-	}
+			goto out_trans_fail;
 
-	if (iomap->type == IOMAP_HOLE) {
-		ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
-		if (ret) {
-			gfs2_trans_end(sdp);
-			gfs2_inplace_release(ip);
-			punch_hole(ip, iomap->offset, iomap->length);
-			goto out_qunlock;
+		if (unstuff) {
+			ret = gfs2_unstuff_dinode(ip, NULL);
+			if (ret)
+				goto out_trans_end;
+			release_metapath(mp);
+			ret = gfs2_iomap_get(inode, iomap->offset,
+					     iomap->length, flags, iomap, mp);
+			if (ret)
+				goto out_trans_end;
+		}
+
+		if (iomap->type == IOMAP_HOLE) {
+			ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
+			if (ret) {
+				gfs2_trans_end(sdp);
+				gfs2_inplace_release(ip);
+				punch_hole(ip, iomap->offset, iomap->length);
+				goto out_qunlock;
+			}
 		}
+
+		tr = current->journal_info;
+		if (tr->tr_num_buf_new)
+			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		else
+			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[0]);
+
+		gfs2_trans_end(sdp);
 	}
-	if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
+
+	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
 		iomap->page_ops = &gfs2_iomap_page_ops;
 	return 0;
 
@@ -1121,10 +1146,6 @@  static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 		    iomap->type != IOMAP_MAPPED)
 			ret = -ENOTBLK;
 	}
-	if (!ret) {
-		get_bh(mp.mp_bh[0]);
-		iomap->private = mp.mp_bh[0];
-	}
 	release_metapath(&mp);
 	trace_gfs2_iomap_end(ip, iomap, ret);
 	return ret;
@@ -1135,27 +1156,16 @@  static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_trans *tr = current->journal_info;
-	struct buffer_head *dibh = iomap->private;
 
 	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
 		goto out;
 
-	if (iomap->type != IOMAP_INLINE) {
+	if (!gfs2_is_stuffed(ip))
 		gfs2_ordered_add_inode(ip);
 
-		if (tr->tr_num_buf_new)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-		else
-			gfs2_trans_add_meta(ip->i_gl, dibh);
-	}
-
-	if (inode == sdp->sd_rindex) {
+	if (inode == sdp->sd_rindex)
 		adjust_fs_space(inode);
-		sdp->sd_rindex_uptodate = 0;
-	}
 
-	gfs2_trans_end(sdp);
 	gfs2_inplace_release(ip);
 
 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
@@ -1175,8 +1185,6 @@  static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 	gfs2_write_unlock(inode);
 
 out:
-	if (dibh)
-		brelse(dibh);
 	return 0;
 }