Message ID | 20180523144357.18985-23-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, May 23, 2018 at 04:43:45PM +0200, Christoph Hellwig wrote: > From: Dave Chinner <dchinner@redhat.com> > ... > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 5dd09e83c81c..a50f69c2c602 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c ... > @@ -845,85 +826,81 @@ xfs_writepage_map( > { > LIST_HEAD(submit_list); > struct xfs_ioend *ioend, *next; > - struct buffer_head *bh, *head; > + struct buffer_head *bh; > ssize_t len = i_blocksize(inode); > - uint64_t offset; > int error = 0; > int count = 0; > - int uptodate = 1; > - unsigned int new_type; > + bool uptodate = true; > + loff_t file_offset; /* file offset of page */ > + unsigned poffset; /* offset into page */ > > - bh = head = page_buffers(page); > - offset = page_offset(page); > - do { > - if (offset >= end_offset) > + /* > + * Walk the blocks on the page, and we we run off then end of the > + * current map or find the current map invalid, grab a new one. > + * We only use bufferheads here to check per-block state - they no > + * longer control the iteration through the page. This allows us to > + * replace the bufferhead with some other state tracking mechanism in > + * future. > + */ > + file_offset = page_offset(page); > + bh = page_buffers(page); > + for (poffset = 0; > + poffset < PAGE_SIZE; > + poffset += len, file_offset += len, bh = bh->b_this_page) { > + /* past the range we are writing, so nothing more to write. */ > + if (file_offset >= end_offset) > break; > - if (!buffer_uptodate(bh)) > - uptodate = 0; > > /* > - * set_page_dirty dirties all buffers in a page, independent > - * of their state. The dirty state however is entirely > - * meaningless for holes (!mapped && uptodate), so skip > - * buffers covering holes here. > + * Block does not contain valid data, skip it, mark the current > + * map as invalid because we have a discontiguity. This ensures > + * we put subsequent writeable buffers into a new ioend. 
> */ > - if (!buffer_mapped(bh) && buffer_uptodate(bh)) { > - wpc->imap_valid = false; > - continue; > - } > - > - if (buffer_unwritten(bh)) > - new_type = XFS_IO_UNWRITTEN; > - else if (buffer_delay(bh)) > - new_type = XFS_IO_DELALLOC; > - else if (buffer_uptodate(bh)) > - new_type = XFS_IO_OVERWRITE; > - else { > + if (!buffer_uptodate(bh)) { > if (PageUptodate(page)) > ASSERT(buffer_mapped(bh)); > - /* > - * This buffer is not uptodate and will not be > - * written to disk. Ensure that we will put any > - * subsequent writeable buffers into a new > - * ioend. > - */ > + uptodate = false; > wpc->imap_valid = false; > continue; > } > > - if (xfs_is_reflink_inode(XFS_I(inode))) { > - error = xfs_map_cow(wpc, inode, offset, &new_type); > - if (error) > - goto out; > - } > - > - if (wpc->io_type != new_type) { > - wpc->io_type = new_type; > - wpc->imap_valid = false; > - } > - > + /* Check to see if current map spans this file offset */ > if (wpc->imap_valid) > wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, > - offset); > + file_offset); > + /* > + * If we don't have a valid map, now it's time to get a new one > + * for this offset. This will convert delayed allocations > + * (including COW ones) into real extents. If we return without > + * a valid map, it means we landed in a hole and we skip the > + * block. 
> + */ > if (!wpc->imap_valid) { > - error = xfs_map_blocks(inode, offset, &wpc->imap, > - wpc->io_type); > + error = xfs_map_blocks(inode, file_offset, &wpc->imap, > + &wpc->io_type); > if (error) > goto out; > wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, > - offset); > + file_offset); > } > - if (wpc->imap_valid) { > - lock_buffer(bh); > - if (wpc->io_type != XFS_IO_OVERWRITE) > - xfs_map_at_offset(inode, bh, &wpc->imap, offset); > - xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); > - count++; > + > + if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) { > + /* > + * set_page_dirty dirties all buffers in a page, independent > + * of their state. The dirty state however is entirely > + * meaningless for holes (!mapped && uptodate), so check we did > + * have a buffer covering a hole here and continue. > + */ The comment above doesn't make much sense given that we don't check for anything here and just continue the loop. That aside, the concern I had with this patch when it was last posted is that it indirectly dropped the error/consistency check between page state and extent state provided by the XFS_BMAPI_DELALLOC flag. What was historically an accounting/reservation issue was turned into something like this by XFS_BMAPI_DELALLOC: # xfs_io -c "pwrite 0 4k" -c fsync /mnt/file wrote 4096/4096 bytes at offset 0 4 KiB, 1 ops; 0.0041 sec (974.184 KiB/sec and 243.5460 ops/sec) fsync: Input/output error As of this patch, that same error condition now behaves something like this: [root@localhost ~]# xfs_io -c "pwrite 0 4k" -c fsync /mnt/file wrote 4096/4096 bytes at offset 0 4 KiB, 1 ops; 0.0029 sec (1.325 MiB/sec and 339.2130 ops/sec) [root@localhost ~]# ls -al /mnt/file -rw-r--r--. 1 root root 4096 May 24 08:27 /mnt/file [root@localhost ~]# umount /mnt ; mount /dev/test/scratch /mnt/ [root@localhost ~]# ls -al /mnt/file -rw-r--r--. 
1 root root 0 May 24 08:27 /mnt/file So our behavior has changed from forced block allocation (violating reservation) and writing the data, to instead return an error, and now to silently skip the page. I suppose there are situations (i.e., races with truncate) where a hole is valid and the correct behavior is to skip the page, and this is admittedly an error condition that "should never happen," but can we at least add an assert somewhere in this series that ensures if uptodate data maps over a hole that the associated block offset is beyond EOF (or something of that nature)? Brian > + continue; > } > > - } while (offset += len, ((bh = bh->b_this_page) != head)); > + lock_buffer(bh); > + xfs_map_at_offset(inode, bh, &wpc->imap, file_offset); > + xfs_add_to_ioend(inode, bh, file_offset, wpc, wbc, &submit_list); > + count++; > + } > > - if (uptodate && bh == head) > + if (uptodate && poffset == PAGE_SIZE) > SetPageUptodate(page); > > ASSERT(wpc->ioend || list_empty(&submit_list)); > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index 69346d460dfa..b2ef5b661761 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -29,6 +29,7 @@ enum { > XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ > XFS_IO_OVERWRITE, /* covers already allocated extent */ > XFS_IO_COW, /* covers copy-on-write extent */ > + XFS_IO_HOLE, /* covers region without any block allocation */ > }; > > #define XFS_IO_TYPES \ > @@ -36,7 +37,8 @@ enum { > { XFS_IO_DELALLOC, "delalloc" }, \ > { XFS_IO_UNWRITTEN, "unwritten" }, \ > { XFS_IO_OVERWRITE, "overwrite" }, \ > - { XFS_IO_COW, "CoW" } > + { XFS_IO_COW, "CoW" }, \ > + { XFS_IO_HOLE, "hole" } > > /* > * Structure for buffered I/O completions. > -- > 2.17.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > + if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) { > > + /* > > + * set_page_dirty dirties all buffers in a page, independent > > + * of their state. The dirty state however is entirely > > + * meaningless for holes (!mapped && uptodate), so check we did > > + * have a buffer covering a hole here and continue. > > + */ > > The comment above doesn't make much sense given that we don't check for > anything here and just continue the loop. It gets removed in the last patch of the original series when we kill buffer heads. But I can fold the removal into this patch as well. > That aside, the concern I had with this patch when it was last posted is > that it indirectly dropped the error/consistency check between page > state and extent state provided by the XFS_BMAPI_DELALLOC flag. What was > historically an accounting/reservation issue was turned into something > like this by XFS_BMAPI_DELALLOC: > > # xfs_io -c "pwrite 0 4k" -c fsync /mnt/file > wrote 4096/4096 bytes at offset 0 > 4 KiB, 1 ops; 0.0041 sec (974.184 KiB/sec and 243.5460 ops/sec) > fsync: Input/output error What is that issue that gets you an I/O error on a 4k write? That is what is missing in the above reproducer? > As of this patch, that same error condition now behaves something like > this: > > [root@localhost ~]# xfs_io -c "pwrite 0 4k" -c fsync /mnt/file > wrote 4096/4096 bytes at offset 0 > 4 KiB, 1 ops; 0.0029 sec (1.325 MiB/sec and 339.2130 ops/sec) > [root@localhost ~]# ls -al /mnt/file > -rw-r--r--. 1 root root 4096 May 24 08:27 /mnt/file > [root@localhost ~]# umount /mnt ; mount /dev/test/scratch /mnt/ > [root@localhost ~]# ls -al /mnt/file > -rw-r--r--. 1 root root 0 May 24 08:27 /mnt/file > > So our behavior has changed from forced block allocation (violating > reservation) and writing the data, to instead return an error, and now > to silently skip the page. We should never, ever allocate space that we didn't have a delalloc reservation for in writepage/writepages. 
But I agree that we should record an error. I have to admit I'm lost on where we did record the error and why we don't do that now. I'd be happy to fix it. > I suppose there are situations (i.e., races > with truncate) where a hole is valid and the correct behavior is to skip > the page, and this is admittedly an error condition that "should never > happen," but can we at least add an assert somewhere in this series that > ensures if uptodate data maps over a hole that the associated block > offset is beyond EOF (or something of that nature)? We can have plenty of holes in dirty pages. However we should never allocate blocks for them. Fortunately we stop even looking at anything but the extent tree for block status by the end of this series for 4k file systems, and with the next series even for small block sizes, so that whole mismatch is a thing of the past now.
On Thu, May 24, 2018 at 06:53:50PM +0200, Christoph Hellwig wrote: > > > + if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) { > > > + /* > > > + * set_page_dirty dirties all buffers in a page, independent > > > + * of their state. The dirty state however is entirely > > > + * meaningless for holes (!mapped && uptodate), so check we did > > > + * have a buffer covering a hole here and continue. > > > + */ > > > > The comment above doesn't make much sense given that we don't check for > > anything here and just continue the loop. > > It gets removed in the last patch of the original series when we > kill buffer heads. But I can fold the removal into this patch as well. > Ah, I was thinking this patch added that comment when it actually mostly moves it (it does tweak it a bit). Eh, no big deal either way. > > That aside, the concern I had with this patch when it was last posted is > > that it indirectly dropped the error/consistency check between page > > state and extent state provided by the XFS_BMAPI_DELALLOC flag. What was > > historically an accounting/reservation issue was turned into something > > like this by XFS_BMAPI_DELALLOC: > > > > # xfs_io -c "pwrite 0 4k" -c fsync /mnt/file > > wrote 4096/4096 bytes at offset 0 > > 4 KiB, 1 ops; 0.0041 sec (974.184 KiB/sec and 243.5460 ops/sec) > > fsync: Input/output error > > What is that issue that gets you an I/O error on a 4k write? That > is what is missing in the above reproducer? > Sorry... I should have mentioned this is a simulated error and not something that actually occurs right now. You can manufacture it easy enough using the drop_writes error tag and comment out the pagecache truncate code in xfs_file_iomap_end_delalloc(). 
> > As of this patch, that same error condition now behaves something like > > this: > > > > [root@localhost ~]# xfs_io -c "pwrite 0 4k" -c fsync /mnt/file > > wrote 4096/4096 bytes at offset 0 > > 4 KiB, 1 ops; 0.0029 sec (1.325 MiB/sec and 339.2130 ops/sec) > > [root@localhost ~]# ls -al /mnt/file > > -rw-r--r--. 1 root root 4096 May 24 08:27 /mnt/file > > [root@localhost ~]# umount /mnt ; mount /dev/test/scratch /mnt/ > > [root@localhost ~]# ls -al /mnt/file > > -rw-r--r--. 1 root root 0 May 24 08:27 /mnt/file > > > > So our behavior has changed from forced block allocation (violating > > reservation) and writing the data, to instead return an error, and now > > to silently skip the page. > > We should never, ever allocate space that we didn't have a delalloc > reservation for in writepage/writepages. But I agree that we should > record and error. I have to admit I'm lost on where we did record > the error and why we don't do that now. I'd be happy to fix it. > Right, the error behavior came from the XFS_BMAPI_DELALLOC flag that was passed from xfs_iomap_write_allocate(). It caused xfs_bmapi_write() to detect that we were in a hole and return an error in the !COW_FORK case since we were expecting to do delalloc conversion from writeback. Note that I'm not saying there's a vector to reproduce this problem in the current code that I'm aware of. I'm just saying it's happened in the past due to bugs and I'd like to preserve some kind of basic sanity check (as an error or assert) if we have enough state available to do so. > > I suppose there are situations (i.e., races > > with truncate) where a hole is valid and the correct behavior is to skip > > the page, and this is admittedly an error condition that "should never > > happen," but can we at least add an assert somewhere in this series that > > ensures if uptodate data maps over a hole that the associated block > > offset is beyond EOF (or something of that nature)? > > We can have plenty of holes in dirty pages. 
However we should never > allocate blocks for them. Fortunately we stop even looking at anything > but the extent tree for block status by the end of this series for 4k > file systems, and with the next series even for small block sizes, so > that whole mismatch is a thing of the past now. Ok, so I guess writeback can see uptodate blocks over a hole if some other block in that page is dirty. Perhaps we could make sure that a dirty page has at least one block that maps to an actual extent or otherwise the page has been truncated..? I guess having another dirty block bitmap similar to iomap_page->uptodate could be required to tell for sure whether a particular block should definitely have a block on-disk or not. It may not be worth doing that just for additional error checks, but I still have to look into the last few patches to grok all the iomap_page stuff. Brian
On Thu, May 24, 2018 at 02:13:56PM -0400, Brian Foster wrote: > Ok, so I guess writeback can see uptodate blocks over a hole if some > other block in that page is dirty. Yes. > Perhaps we could make sure that a > dirty page has at least one block that maps to an actual extent or > otherwise the page has been truncated..? We have the following comment near the end of xfs_writepage_map: /* * We can end up here with no error and nothing to write if we * race with a partial page truncate on a sub-page block sized * filesystem. In that case we need to mark the page clean. */ And I'm pretty sure I managed to hit that case easily in xfstests, as my initial handling of it was wrong. So I don't think we can even check for that. > I guess having another dirty block bitmap similar to > iomap_page->uptodate could be required to tell for sure whether a > particular block should definitely have a block on-disk or not. It may > not be worth doing that just for additional error checks, but I still > have to look into the last few patches to grok all the iomap_page stuff. I don't think it's worth it. The sub-page dirty tracking has been one of the issues with the buffer head code that caused a lot of problems, and that we want to get rid of.
On Fri, May 25, 2018 at 08:19:00AM +0200, Christoph Hellwig wrote: > On Thu, May 24, 2018 at 02:13:56PM -0400, Brian Foster wrote: > > Ok, so I guess writeback can see uptodate blocks over a hole if some > > other block in that page is dirty. > > Yes. > > > Perhaps we could make sure that a > > dirty page has at least one block that maps to an actual extent or > > otherwise the page has been truncated..? > > We have the following comment near the end of xfs_writepage_map: > That comment is what I'm basing on... > /* > * We can end up here with no error and nothing to write if we > * race with a partial page truncate on a sub-page block sized > * filesystem. In that case we need to mark the page clean. > */ > So we can correctly end up with nothing to write on a dirty page, but it presumes a race with truncate. So suppose we end up with a dirty page, at least one uptodate block, count is zero (i.e., due to holes) and i_size is beyond the page. Would that not be completely bogus? If bogus, I think that would at least detect the dumb example I posted earlier. Brian > And I'm pretty sure I managed to hit that case easily in xfstests, > as my initial handling of it was wrong. So I don't think we can > even check for that. > > > I guess having another dirty block bitmap similar to > > iomap_page->uptodate could be required to tell for sure whether a > > particular block should definitely have a block on-disk or not. It may > > not be worth doing that just for additional error checks, but I still > > have to look into the last few patches to grok all the iomap_page stuff. > > I don't think it's worth it. The sub-page dirty tracking has been one > of the issues with the buffer head code that caused a lot of problems, > and that we want to get rid of. > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, May 25, 2018 at 07:35:33AM -0400, Brian Foster wrote: > That comment is what I'm basing on... > > > /* > > * We can end up here with no error and nothing to write if we > > * race with a partial page truncate on a sub-page block sized > > * filesystem. In that case we need to mark the page clean. > > */ > > > > So we can correctly end up with nothing to write on a dirty page, but it > presumes a race with truncate. So suppose we end up with a dirty page, > > at least one uptodate block, count is zero (i.e., due to holes) and > i_size is beyond the page. Would that not be completely bogus? If bogus, > I think that would at least detect the dumb example I posted earlier. The trivial file_offset >= i_size_read assert explodes pretty soon in generic/091, and already does so with the existing mainline code. I'd rather not open another can of worms right now..
On Mon, May 28, 2018 at 09:15:43AM +0200, Christoph Hellwig wrote: > On Fri, May 25, 2018 at 07:35:33AM -0400, Brian Foster wrote: > > That comment is what I'm basing on... > > > > > /* > > > * We can end up here with no error and nothing to write if we > > > * race with a partial page truncate on a sub-page block sized > > > * filesystem. In that case we need to mark the page clean. > > > */ > > > > > > > So we can correctly end up with nothing to write on a dirty page, but it > > presumes a race with truncate. So suppose we end up with a dirty page, > > > > at least one uptodate block, count is zero (i.e., due to holes) and > > i_size is beyond the page. Would that not be completely bogus? If bogus, > > I think that would at least detect the dumb example I posted earlier. > > The trivial file_offset >= i_size_read assert explodes pretty soon > in generic/091, and already does so with the existing mainline code. > What exactly is the trivial check? Can you show the code please? Brian > I'd rather not open another can of worms right now.. > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 29, 2018 at 07:26:31AM -0400, Brian Foster wrote:
> What exactly is the trivial check? Can you show the code please?
ASSERT(file_offset > i_size_read(inode)) in the !count block
at the end of xfs_writepage_map.
(file_offset replaced with page_offset(page) + offset for the mainline
code).
On Tue, May 29, 2018 at 03:08:46PM +0200, Christoph Hellwig wrote: > On Tue, May 29, 2018 at 07:26:31AM -0400, Brian Foster wrote: > > What exactly is the trivial check? Can you show the code please? > > ASSERT(file_offset > i_size_read(inode)) in the !count block > at the end of xfs_writepage_map. > > (file_offset replaced with page_offset(page) + offset for the mainline > code). Ok, so we (mainline) somehow or another end up in writeback with a page (inside EOF) with a combination of (!mapped && !uptodate) and (!mapped && uptodate) (unwritten?) buffers, none of them actually being dirty. I'm not quite sure how that happens, but I think it does rule out the count == 0 && at least one uptodate segment logic I proposed earlier. Fair enough. Given that, I'm not sure there's a good way to trigger such error detection without actual dirty state, and it's certainly not worth complicating the design just for that. Thanks for trying, at least. Hmm, that does have me wondering a bit if/how we'd end up writing back zeroed blocks over unwritten extents with no other dirty user data in the page (since the initial xfs_writepage_map() rework patch factors out the uptodate && !mapped skipping logic). Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5dd09e83c81c..a50f69c2c602 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -378,78 +378,101 @@ xfs_map_blocks( struct inode *inode, loff_t offset, struct xfs_bmbt_irec *imap, - int type) + int *type) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb, end_fsb; + int whichfork = XFS_DATA_FORK; int error = 0; - int bmapi_flags = XFS_BMAPI_ENTIRE; int nimaps = 1; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - /* - * Truncate can race with writeback since writeback doesn't take the - * iolock and truncate decreases the file size before it starts - * truncating the pages between new_size and old_size. Therefore, we - * can end up in the situation where writeback gets a CoW fork mapping - * but the truncate makes the mapping invalid and we end up in here - * trying to get a new mapping. Bail out here so that we simply never - * get a valid mapping and so we drop the write altogether. The page - * truncation will kill the contents anyway. - */ - if (type == XFS_IO_COW && offset > i_size_read(inode)) - return 0; - - ASSERT(type != XFS_IO_COW); - if (type == XFS_IO_UNWRITTEN) - bmapi_flags |= XFS_BMAPI_IGSTATE; - xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); + if (xfs_is_reflink_inode(ip) && + xfs_reflink_find_cow_mapping(ip, offset, imap)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* + * Truncate can race with writeback since writeback doesn't + * take the iolock and truncate decreases the file size before + * it starts truncating the pages between new_size and old_size. + * Therefore, we can end up in the situation where writeback + * gets a CoW fork mapping but the truncate makes the mapping + * invalid and we end up in here trying to get a new mapping. 
+ * bail out here so that we simply never get a valid mapping + * and so we drop the write altogether. The page truncation + * will kill the contents anyway. + */ + if (offset > i_size_read(inode)) + return 0; + whichfork = XFS_COW_FORK; + *type = XFS_IO_COW; + goto done; + } + if (offset > mp->m_super->s_maxbytes - count) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - imap, &nimaps, bmapi_flags); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This forces us - * to come back to writepage to take care of the CoW. - */ - if (nimaps && type == XFS_IO_OVERWRITE) + imap, &nimaps, XFS_BMAPI_ENTIRE); + if (!nimaps) { + /* + * Lookup returns no match? Beyond eof? regardless, + * return it as a hole so we don't write it + */ + imap->br_startoff = offset_fsb; + imap->br_blockcount = end_fsb - offset_fsb; + imap->br_startblock = HOLESTARTBLOCK; + *type = XFS_IO_HOLE; + } else if (imap->br_startblock == HOLESTARTBLOCK) { + /* landed in a hole */ + *type = XFS_IO_HOLE; + } else if (isnullstartblock(imap->br_startblock)) { + /* got a delalloc extent */ + *type = XFS_IO_DELALLOC; + } else { + /* + * Got an existing extent for overwrite. Truncate it if there + * is a pending CoW reservation before the end of this extent, + * so that we pick up the COW extents in the next iteration. 
+ */ xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap); + if (imap->br_state == XFS_EXT_UNWRITTEN) + *type = XFS_IO_UNWRITTEN; + else + *type = XFS_IO_OVERWRITE; + } xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (error) return error; - if (type == XFS_IO_DELALLOC && - (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, - imap); - if (!error) - trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); - return error; - } - -#ifdef DEBUG - if (type == XFS_IO_UNWRITTEN) { - ASSERT(nimaps); - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); +done: + switch (*type) { + case XFS_IO_HOLE: + case XFS_IO_OVERWRITE: + case XFS_IO_UNWRITTEN: + /* nothing to do! */ + trace_xfs_map_blocks_found(ip, offset, count, *type, imap); + return 0; + case XFS_IO_DELALLOC: + case XFS_IO_COW: + error = xfs_iomap_write_allocate(ip, whichfork, offset, imap); + if (error) + return error; + trace_xfs_map_blocks_alloc(ip, offset, count, *type, imap); + return 0; + default: + ASSERT(1); + return -EFSCORRUPTED; } -#endif - if (nimaps) - trace_xfs_map_blocks_found(ip, offset, count, type, imap); - return 0; } STATIC bool @@ -709,6 +732,14 @@ xfs_map_at_offset( set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); + + /* + * If this is a realtime file, data may be on a different device. + * to that pointed to from the buffer_head b_bdev currently. We can't + * trust that the bufferhead has a already been mapped correctly, so + * set the bdev now. 
+ */ + bh->b_bdev = xfs_find_bdev_for_inode(inode); } STATIC void @@ -769,56 +800,6 @@ xfs_aops_discard_page( xfs_vm_invalidatepage(page, 0, PAGE_SIZE); } -static int -xfs_map_cow( - struct xfs_writepage_ctx *wpc, - struct inode *inode, - loff_t offset, - unsigned int *new_type) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_bmbt_irec imap; - bool is_cow = false; - int error; - - /* - * If we already have a valid COW mapping keep using it. - */ - if (wpc->io_type == XFS_IO_COW) { - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); - if (wpc->imap_valid) { - *new_type = XFS_IO_COW; - return 0; - } - } - - /* - * Else we need to check if there is a COW mapping at this offset. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!is_cow) - return 0; - - /* - * And if the COW mapping has a delayed extent here we need to - * allocate real space for it now. - */ - if (isnullstartblock(imap.br_startblock)) { - error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, - &imap); - if (error) - return error; - } - - wpc->io_type = *new_type = XFS_IO_COW; - wpc->imap_valid = true; - wpc->imap = imap; - return 0; -} - /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate @@ -845,85 +826,81 @@ xfs_writepage_map( { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; - struct buffer_head *bh, *head; + struct buffer_head *bh; ssize_t len = i_blocksize(inode); - uint64_t offset; int error = 0; int count = 0; - int uptodate = 1; - unsigned int new_type; + bool uptodate = true; + loff_t file_offset; /* file offset of page */ + unsigned poffset; /* offset into page */ - bh = head = page_buffers(page); - offset = page_offset(page); - do { - if (offset >= end_offset) + /* + * Walk the blocks on the page, and we we run off then end of the + * current map or find the 
current map invalid, grab a new one. + * We only use bufferheads here to check per-block state - they no + * longer control the iteration through the page. This allows us to + * replace the bufferhead with some other state tracking mechanism in + * future. + */ + file_offset = page_offset(page); + bh = page_buffers(page); + for (poffset = 0; + poffset < PAGE_SIZE; + poffset += len, file_offset += len, bh = bh->b_this_page) { + /* past the range we are writing, so nothing more to write. */ + if (file_offset >= end_offset) break; - if (!buffer_uptodate(bh)) - uptodate = 0; /* - * set_page_dirty dirties all buffers in a page, independent - * of their state. The dirty state however is entirely - * meaningless for holes (!mapped && uptodate), so skip - * buffers covering holes here. + * Block does not contain valid data, skip it, mark the current + * map as invalid because we have a discontiguity. This ensures + * we put subsequent writeable buffers into a new ioend. */ - if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - wpc->imap_valid = false; - continue; - } - - if (buffer_unwritten(bh)) - new_type = XFS_IO_UNWRITTEN; - else if (buffer_delay(bh)) - new_type = XFS_IO_DELALLOC; - else if (buffer_uptodate(bh)) - new_type = XFS_IO_OVERWRITE; - else { + if (!buffer_uptodate(bh)) { if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); - /* - * This buffer is not uptodate and will not be - * written to disk. Ensure that we will put any - * subsequent writeable buffers into a new - * ioend. 
- */ + uptodate = false; wpc->imap_valid = false; continue; } - if (xfs_is_reflink_inode(XFS_I(inode))) { - error = xfs_map_cow(wpc, inode, offset, &new_type); - if (error) - goto out; - } - - if (wpc->io_type != new_type) { - wpc->io_type = new_type; - wpc->imap_valid = false; - } - + /* Check to see if current map spans this file offset */ if (wpc->imap_valid) wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, - offset); + file_offset); + /* + * If we don't have a valid map, now it's time to get a new one + * for this offset. This will convert delayed allocations + * (including COW ones) into real extents. If we return without + * a valid map, it means we landed in a hole and we skip the + * block. + */ if (!wpc->imap_valid) { - error = xfs_map_blocks(inode, offset, &wpc->imap, - wpc->io_type); + error = xfs_map_blocks(inode, file_offset, &wpc->imap, + &wpc->io_type); if (error) goto out; wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, - offset); + file_offset); } - if (wpc->imap_valid) { - lock_buffer(bh); - if (wpc->io_type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); - count++; + + if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) { + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so check we did + * have a buffer covering a hole here and continue. 
+ */ + continue; } - } while (offset += len, ((bh = bh->b_this_page) != head)); + lock_buffer(bh); + xfs_map_at_offset(inode, bh, &wpc->imap, file_offset); + xfs_add_to_ioend(inode, bh, file_offset, wpc, wbc, &submit_list); + count++; + } - if (uptodate && bh == head) + if (uptodate && poffset == PAGE_SIZE) SetPageUptodate(page); ASSERT(wpc->ioend || list_empty(&submit_list)); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 69346d460dfa..b2ef5b661761 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -29,6 +29,7 @@ enum { XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ XFS_IO_COW, /* covers copy-on-write extent */ + XFS_IO_HOLE, /* covers region without any block allocation */ }; #define XFS_IO_TYPES \ @@ -36,7 +37,8 @@ enum { { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } + { XFS_IO_COW, "CoW" }, \ + { XFS_IO_HOLE, "hole" } /* * Structure for buffered I/O completions.