diff mbox

dax: fix deadlock in __dax_fault

Message ID 20150928101214.GA19114@dastard (mailing list archive)
State Superseded
Headers show

Commit Message

Dave Chinner Sept. 28, 2015, 10:12 a.m. UTC
On Mon, Sep 28, 2015 at 10:59:04AM +1000, Dave Chinner wrote:
> > Does this sound like a reasonable path forward for v4.3?  Dave, and Jan, can
> > you guys can provide guidance and code reviews for the XFS and ext4 bits?
> 
> IMO, it's way too much to get into 4.3. I'd much prefer we revert
> the bad changes in 4.3, and then work towards fixing this for the
> 4.4 merge window. If someone needs this for 4.3, then they can
> backport the 4.4 code to 4.3-stable.

FWIW, here's the first bit of making XFS clear blocks during
allocation. There's a couple of things I need to fix (e.g. moving
the zeroing out of the transaction context but still under the inode
allocation lock and hence atomic with the allocation), it needs to
be split into at least two patches (to split out the
xfs_imap_to_sector() helper), and there's another patch to remove
the complete_unwritten callbacks. I also need to do more audit and
validation on the handling of unwritten extents during get_blocks()
for write faults - the get_blocks() call in this case effectively
becomes an unwritten extent conversion call rather than an
allocation call, and I need to validate that it does what get_block
expects it to do. And that I haven't broken any of the other
get_blocks callers. So still lots to do.

Cheers,

Dave.

Comments

kernel test robot Sept. 28, 2015, 10:23 a.m. UTC | #1
Hi Dave,

[auto build test results on v4.3-rc3 -- if it's inappropriate base, please ignore]

config: xtensa-allyesconfig (attached as .config)
reproduce:
  wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
  chmod +x ~/bin/make.cross
  git checkout 285753fa883fcbeac6b393da338b6e976af57912
  # save the attached .config to linux build tree
  make.cross ARCH=xtensa 

All warnings (new ones prefixed by >>):

   fs/xfs/xfs_iomap.c: In function 'xfs_iomap_write_direct':
>> fs/xfs/xfs_iomap.c:247:5: warning: format '%ld' expects argument of type 'long int', but argument 5 has type 'size_t' [-Wformat=]
        imap->br_blockcount);
        ^
>> fs/xfs/xfs_iomap.c:247:5: warning: format '%ld' expects argument of type 'long int', but argument 6 has type 'sector_t' [-Wformat=]

vim +247 fs/xfs/xfs_iomap.c

   231		/* DAX needs to zero the entire allocated extent here */
   232		if (IS_DAX(VFS_I(ip)) && nimaps) {
   233			sector_t sector = xfs_imap_to_sector(VFS_I(ip), imap, offset);
   234	
   235			ASSERT(!ISUNWRITTEN(imap));
   236			ASSERT(nimaps == 1);
   237			error = dax_clear_blocks(VFS_I(ip),
   238					sector >> (VFS_I(ip)->i_blkbits - BBSHIFT),
   239					XFS_FSB_TO_B(mp, imap->br_blockcount));
   240			if (error) {
   241				xfs_warn(mp,
   242	 "err %d, off/cnt %lld/%ld, sector %ld, bytes %lld, im.stblk %lld, im.stoff %lld, im.blkcnt %lld",
   243					error, offset, count, 
   244					xfs_imap_to_sector(VFS_I(ip), imap, offset),
   245					XFS_FSB_TO_B(mp, imap->br_blockcount),
   246					imap->br_startblock, imap->br_startoff,
 > 247					imap->br_blockcount);
   248				goto out_trans_cancel;
   249			}
   250		}
   251	
   252		error = xfs_trans_commit(tp);
   253		if (error)
   254			goto out_unlock;
   255	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Sept. 28, 2015, 10:23 a.m. UTC | #2
Hi Dave,

[auto build test results on v4.3-rc3 -- if it's inappropriate base, please ignore]

config: mips-allyesconfig (attached as .config)
reproduce:
  wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
  chmod +x ~/bin/make.cross
  git checkout 285753fa883fcbeac6b393da338b6e976af57912
  # save the attached .config to linux build tree
  make.cross ARCH=mips 

All warnings (new ones prefixed by >>):

   fs/xfs/xfs_iomap.c: In function 'xfs_iomap_write_direct':
>> fs/xfs/xfs_iomap.c:242:2: warning: format '%ld' expects argument of type 'long int', but argument 5 has type 'size_t {aka unsigned int}' [-Wformat=]
     "err %d, off/cnt %lld/%ld, sector %ld, bytes %lld, im.stblk %lld, im.stoff %lld, im.blkcnt %lld",
     ^
>> fs/xfs/xfs_iomap.c:242:2: warning: format '%ld' expects argument of type 'long int', but argument 6 has type 'sector_t {aka long long unsigned int}' [-Wformat=]

vim +242 fs/xfs/xfs_iomap.c

   226		 */
   227		error = xfs_bmap_finish(&tp, &free_list, &committed);
   228		if (error)
   229			goto out_bmap_cancel;
   230	
   231		/* DAX needs to zero the entire allocated extent here */
   232		if (IS_DAX(VFS_I(ip)) && nimaps) {
   233			sector_t sector = xfs_imap_to_sector(VFS_I(ip), imap, offset);
   234	
   235			ASSERT(!ISUNWRITTEN(imap));
   236			ASSERT(nimaps == 1);
   237			error = dax_clear_blocks(VFS_I(ip),
   238					sector >> (VFS_I(ip)->i_blkbits - BBSHIFT),
   239					XFS_FSB_TO_B(mp, imap->br_blockcount));
   240			if (error) {
   241				xfs_warn(mp,
 > 242	 "err %d, off/cnt %lld/%ld, sector %ld, bytes %lld, im.stblk %lld, im.stoff %lld, im.blkcnt %lld",
   243					error, offset, count, 
   244					xfs_imap_to_sector(VFS_I(ip), imap, offset),
   245					XFS_FSB_TO_B(mp, imap->br_blockcount),
   246					imap->br_startblock, imap->br_startoff,
   247					imap->br_blockcount);
   248				goto out_trans_cancel;
   249			}
   250		}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox

Patch

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 50ab287..f645587 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -586,27 +586,35 @@  xfs_add_to_ioend(
 	ioend->io_size += bh->b_size;
 }
 
-STATIC void
-xfs_map_buffer(
+sector_t
+xfs_imap_to_sector(
 	struct inode		*inode,
-	struct buffer_head	*bh,
 	struct xfs_bmbt_irec	*imap,
 	xfs_off_t		offset)
 {
-	sector_t		bn;
-	struct xfs_mount	*m = XFS_I(inode)->i_mount;
-	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
-	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
+	xfs_off_t		iomap_offset;
+	xfs_daddr_t		iomap_bn;
 
 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
-	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
-	      ((offset - iomap_offset) >> inode->i_blkbits);
+	iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+	iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
 
-	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
+	return iomap_bn + BTOBB(offset - iomap_offset);
+}
 
-	bh->b_blocknr = bn;
+STATIC void
+xfs_map_buffer(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	bh->b_blocknr = xfs_imap_to_sector(inode, imap, offset) >>
+						(inode->i_blkbits - BBSHIFT);
+	ASSERT(bh->b_blocknr || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 	set_buffer_mapped(bh);
 }
 
@@ -617,11 +625,7 @@  xfs_map_at_offset(
 	struct xfs_bmbt_irec	*imap,
 	xfs_off_t		offset)
 {
-	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
 	xfs_map_buffer(inode, bh, imap, offset);
-	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
 	clear_buffer_unwritten(bh);
 }
@@ -1396,7 +1400,8 @@  __xfs_get_blocks(
 	if (create &&
 	    (!nimaps ||
 	     (imap.br_startblock == HOLESTARTBLOCK ||
-	      imap.br_startblock == DELAYSTARTBLOCK))) {
+	      imap.br_startblock == DELAYSTARTBLOCK) ||
+	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
 		if (direct || xfs_get_extsz_hint(ip)) {
 			/*
 			 * Drop the ilock in preparation for starting the block
@@ -1441,6 +1446,9 @@  __xfs_get_blocks(
 		goto out_unlock;
 	}
 
+	if (IS_DAX(inode))
+		ASSERT(!ISUNWRITTEN(&imap));
+
 	/* trim mapping down to size requested */
 	if (direct || size > (1 << inode->i_blkbits))
 		xfs_map_trim_size(inode, iblock, bh_result,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1a..ede1025 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,6 +18,8 @@ 
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
 
+struct xfs_bmbt_irec;
+
 extern mempool_t *xfs_ioend_pool;
 
 /*
@@ -62,4 +64,7 @@  void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 
+sector_t xfs_imap_to_sector(struct inode *inode, struct xfs_bmbt_irec *imap,
+			    xfs_off_t offset);
+
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033..277cd82 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -131,6 +131,7 @@  xfs_iomap_write_direct(
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
 	int		error;
+	int		bmapi_flags = XFS_BMAPI_PREALLOC;
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
@@ -196,13 +197,26 @@  xfs_iomap_write_direct(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	/*
+	 * For DAX, we do not allocate unwritten extents, but instead we zero
+	 * the block before we commit the transaction. Hence if we are mapping
+	 * unwritten extents here, we need to convert them to written so that we
+	 * don't need an unwritten extent callback here.
+	 *
+	 * Block zeroing for DAX is effectively a memset operation and so should
+	 * not block on anything when we call it after the block allocation or
+	 * conversion before we commit the transaction.
+	 */
+	if (IS_DAX(VFS_I(ip)))
+		bmapi_flags = XFS_BMAPI_CONVERT;
+
+	/*
 	 * From this point onwards we overwrite the imap pointer that the
 	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-				XFS_BMAPI_PREALLOC, &firstfsb, 0,
+				bmapi_flags, &firstfsb, 0,
 				imap, &nimaps, &free_list);
 	if (error)
 		goto out_bmap_cancel;
@@ -213,6 +227,28 @@  xfs_iomap_write_direct(
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
 		goto out_bmap_cancel;
+
+	/* DAX needs to zero the entire allocated extent here */
+	if (IS_DAX(VFS_I(ip)) && nimaps) {
+		sector_t sector = xfs_imap_to_sector(VFS_I(ip), imap, offset);
+
+		ASSERT(!ISUNWRITTEN(imap));
+		ASSERT(nimaps == 1);
+		error = dax_clear_blocks(VFS_I(ip),
+				sector >> (VFS_I(ip)->i_blkbits - BBSHIFT),
+				XFS_FSB_TO_B(mp, imap->br_blockcount));
+		if (error) {
+			xfs_warn(mp,
+ "err %d, off/cnt %lld/%ld, sector %ld, bytes %lld, im.stblk %lld, im.stoff %lld, im.blkcnt %lld",
+				error, offset, count, 
+				xfs_imap_to_sector(VFS_I(ip), imap, offset),
+				XFS_FSB_TO_B(mp, imap->br_blockcount),
+				imap->br_startblock, imap->br_startoff,
+				imap->br_blockcount);
+			goto out_trans_cancel;
+		}
+	}
+
 	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_unlock;