[RFC,v3,19/26] ext4: implement writeback iomap path

Message ID	20240127015825.1608160-20-yi.zhang@huaweicloud.com (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> From: Zhang Yi <yi.zhang@huaweicloud.com> To: linux-ext4@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, tytso@mit.edu, adilger.kernel@dilger.ca, jack@suse.cz, ritesh.list@gmail.com, hch@infradead.org, djwong@kernel.org, willy@infradead.org, zokeefe@google.com, yi.zhang@huawei.com, yi.zhang@huaweicloud.com, chengzhihao1@huawei.com, yukuai3@huawei.com, wangkefeng.wang@huawei.com Subject: [RFC PATCH v3 19/26] ext4: implement writeback iomap path Date: Sat, 27 Jan 2024 09:58:18 +0800 Message-Id: <20240127015825.1608160-20-yi.zhang@huaweicloud.com> In-Reply-To: <20240127015825.1608160-1-yi.zhang@huaweicloud.com> References: <20240127015825.1608160-1-yi.zhang@huaweicloud.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	ext4: use iomap for regular file's buffered IO path and enable large foilo \| expand [RFC,v3,00/26] ext4: use iomap for regular file's buffered IO path and enable large foilo [v3,01/26] ext4: refactor ext4_da_map_blocks() [v3,02/26] ext4: convert to exclusive lock while inserting delalloc extents [v3,03/26] ext4: correct the hole length returned by ext4_map_blocks() [v3,04/26] ext4: add a hole extent entry in cache after punch [v3,05/26] ext4: make ext4_map_blocks() distinguish delalloc only extent [v3,06/26] ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC map type [RFC,v3,07/26] iomap: don't increase i_size if it's not a write operation [RFC,v3,08/26] iomap: add pos and dirty_len into trace_iomap_writepage_map [RFC,v3,09/26] ext4: allow inserting delalloc extents with multi-blocks [RFC,v3,10/26] ext4: correct delalloc extent length [RFC,v3,11/26] ext4: also mark extent as delalloc if it's been unwritten [RFC,v3,12/26] ext4: factor out bh handles to ext4_da_get_block_prep() [RFC,v3,13/26] ext4: use reserved metadata blocks when splitting extent in endio [RFC,v3,14/26] ext4: factor out ext4_map_{create\|query}_blocks() [RFC,v3,15/26] ext4: introduce seq counter for extent entry [RFC,v3,16/26] ext4: add a new iomap aops for regular file's buffered IO path [RFC,v3,17/26] ext4: implement buffered read iomap path [RFC,v3,18/26] ext4: implement buffered write iomap path [RFC,v3,19/26] ext4: implement writeback iomap path [RFC,v3,20/26] ext4: implement mmap iomap path [RFC,v3,21/26] ext4: implement zero_range iomap path [RFC,v3,22/26] ext4: writeback partial blocks before zero range [RFC,v3,23/26] ext4: fall back to buffer_head path for defrag [RFC,v3,24/26] ext4: partially enable iomap for regular file's buffered IO path [RFC,v3,25/26] filemap: support disable large folios on active inode [RFC,v3,26/26] ext4: enable large folio for regular file with iomap buffered IO path

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 03cdcf3d86a5..eaf29bade606 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1147,6 +1147,8 @@ struct ext4_inode_info { */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; + struct list_head i_iomap_ioend_list; + struct work_struct i_iomap_ioend_work; atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -3755,6 +3757,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); +extern void ext4_iomap_end_io(struct work_struct *work); +extern void ext4_iomap_end_bio(struct bio *bio); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d1a2e6624401..94c8073b49e7 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -11,6 +11,12 @@ int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* + * Ordered mode is no longer needed for the inode that use the + * iomap path, always use writeback mode. + */ + if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 48d6d125ec37..46805b8e7bdc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3708,18 +3708,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested it is a clear sign that we still - * have some extent state machine issues left. So extent_split is still - * required. - * TODO: Once all related issues will be fixed this situation should be - * illegal. + /* + * For the inodes that use the buffered iomap path need to split + * extents in endio, other inodes not. + * + * TODO: Reserve enough sapce for splitting extents, always split + * extents here, and totally remove this warning. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef CONFIG_EXT4_DEBUG - ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u", - inode->i_ino, (unsigned long long)ee_block, ee_len, - (unsigned long long)map->m_lblk, map->m_len); + if (!ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) { + ext4_warning(inode->i_sb, + "Inode (%ld) finished: extent logical block %llu, " + "len %u; IO logical block %llu, len %u", + inode->i_ino, (unsigned long long)ee_block, + ee_len, (unsigned long long)map->m_lblk, + map->m_len); + } #endif err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c48aca637896..8ee0eca6a1e8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -43,6 +43,7 @@ #include <linux/iversion.h> #include "ext4_jbd2.h" +#include "ext4_extents.h" #include "xattr.h" #include "acl.h" #include "truncate.h" @@ -3705,10 +3706,197 @@ static void ext4_iomap_readahead(struct readahead_control *rac) iomap_readahead(rac, &ext4_iomap_buffered_read_ops); } +struct ext4_writeback_ctx { + struct iomap_writepage_ctx ctx; + struct writeback_control *wbc; + unsigned int data_seq; +}; + +static int ext4_iomap_map_one_extent(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct extent_status es; + handle_t *handle = NULL; + int credits, map_flags; + int retval; + + credits = ext4_da_writepages_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + map->m_flags = 0; + /* + * In order to protect from the race of truncate, we have to lookup + * extent stats and map blocks under i_data_sem, otherwise the + * delalloc extent could be stale. + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (likely(ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es))) { + retval = es.es_len - (map->m_lblk - es.es_lblk); + map->m_len = min_t(unsigned int, retval, map->m_len); + + if (likely(ext4_es_is_delonly(&es))) { + trace_ext4_da_write_pages_extent(inode, map); + /* + * Call ext4_map_create_blocks() to allocate any delayed + * allocation blocks. It is possible that we're going to + * need more metadata blocks, however we must not fail + * because we're in writeback and there is nothing we + * can do so it might result in data loss. So use + * reserved blocks to allocate metadata if possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | + EXT4_GET_BLOCKS_METADATA_NOFAIL | + EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + retval = ext4_map_create_blocks(handle, inode, map, + map_flags); + } + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - + es.es_lblk; + map->m_flags = ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + } + } else { + retval = ext4_map_query_blocks(handle, inode, map, 0); + } + + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + return retval < 0 ? retval : 0; +} + +static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset, + unsigned int dirty_len) +{ + struct ext4_writeback_ctx *ewpc = + container_of(wpc, struct ext4_writeback_ctx, ctx); + struct super_block *sb = inode->i_sb; + struct journal_s *journal = EXT4_SB(sb)->s_journal; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int index = offset >> blkbits; + unsigned int end, len; + int ret; + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + + /* Check validity of the cached writeback mapping. */ + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length && + ewpc->data_seq == READ_ONCE(ei->i_es_seq)) + return 0; + + end = min_t(unsigned int, + (ewpc->wbc->range_end >> blkbits), (UINT_MAX - 1)); + len = (end > index + dirty_len) ? end - index + 1 : dirty_len; + +retry: + map.m_lblk = index; + map.m_len = min_t(unsigned int, EXT_UNWRITTEN_MAX_LEN, len); + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + + /* + * The map isn't a delalloc extent, it must be a hole or have + * already been allocated. + */ + if (!(map.m_flags & EXT4_MAP_DELAYED)) + goto out; + + /* Map one delalloc extent. */ + ret = ext4_iomap_map_one_extent(inode, &map); + if (ret < 0) { + if (ext4_forced_shutdown(sb)) + return ret; + + /* + * Retry transient ENOSPC errors, if + * ext4_count_free_blocks() is non-zero, a commit + * should free up blocks. + */ + if (ret == -ENOSPC && ext4_count_free_clusters(sb)) { + jbd2_journal_force_commit_nested(journal); + goto retry; + } + + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with " + "max blocks %u with error %d", + inode->i_ino, (unsigned long long)map.m_lblk, + (unsigned int)map.m_len, -ret); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (ret == -ENOSPC) + ext4_print_free_blocks(inode); + return ret; + } +out: + ewpc->data_seq = READ_ONCE(ei->i_es_seq); + ext4_set_iomap(inode, &wpc->iomap, &map, offset, + map.m_len << blkbits, 0); + return 0; +} + +static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status) +{ + struct ext4_inode_info *ei = EXT4_I(ioend->io_inode); + + /* Need to convert unwritten extents when I/Os are completed. */ + if (ioend->io_type == IOMAP_UNWRITTEN || + ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize)) + ioend->io_bio.bi_end_io = ext4_iomap_end_bio; + + return status; +} + +static void ext4_iomap_discard_folio(struct folio *folio, loff_t pos) +{ + struct inode *inode = folio->mapping->host; + + ext4_iomap_punch_delalloc(inode, pos, + folio_pos(folio) + folio_size(folio) - pos); +} + +static const struct iomap_writeback_ops ext4_writeback_ops = { + .map_blocks = ext4_iomap_map_blocks, + .prepare_ioend = ext4_iomap_prepare_ioend, + .discard_folio = ext4_iomap_discard_folio, +}; + static int ext4_iomap_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return 0; + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + long nr = wbc->nr_to_write; + int alloc_ctx, ret; + struct ext4_writeback_ctx ewpc = { + .wbc = wbc, + }; + + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + + alloc_ctx = ext4_writepages_down_read(sb); + trace_ext4_writepages(inode, wbc); + ret = iomap_writepages(mapping, wbc, &ewpc.ctx, &ext4_writeback_ops); + trace_ext4_writepages_result(inode, wbc, ret, nr - wbc->nr_to_write); + ext4_writepages_up_read(sb, alloc_ctx); + + return ret; } /* diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dfdd7e5cf038..a499208500e4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -22,6 +22,7 @@ #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> +#include <linux/iomap.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/sched/mm.h> @@ -565,3 +566,109 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, return 0; } + +static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend) +{ + struct inode *inode = ioend->io_inode; + struct ext4_inode_info *ei = EXT4_I(inode); + loff_t pos = ioend->io_offset; + size_t size = ioend->io_size; + loff_t new_disksize; + handle_t *handle; + int credits; + int ret, err; + + ret = blk_status_to_errno(ioend->io_bio.bi_status); + if (unlikely(ret)) + goto out; + + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + credits = ext4_chunk_trans_blocks(inode, + EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits)); + handle = ext4_journal_start(inode, EXT4_HT_EXT_CONVERT, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_err; + } + + if (ioend->io_type == IOMAP_UNWRITTEN) { + ret = ext4_convert_unwritten_extents(handle, inode, pos, size); + if (ret) + goto out_journal; + } + + /* + * Update on-disk size after IO is completed. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ + new_disksize = pos + size; + if (new_disksize > READ_ONCE(ei->i_disksize)) { + down_write(&ei->i_data_sem); + new_disksize = min(new_disksize, i_size_read(inode)); + if (new_disksize > ei->i_disksize) + ei->i_disksize = new_disksize; + up_write(&ei->i_data_sem); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + EXT4_ERROR_INODE_ERR(inode, -ret, + "Failed to mark inode dirty"); + } + +out_journal: + err = ext4_journal_stop(handle); + if (!ret) + ret = err; +out_err: + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to " + "written extents or update inode size -- " + "potential data loss! (inode %lu, error %d)", + inode->i_ino, ret); + } +out: + iomap_finish_ioends(ioend, ret); +} + +/* + * Work on buffered iomap completed IO, to convert unwritten extents to + * mapped extents + */ +void ext4_iomap_end_io(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_iomap_ioend_work); + struct iomap_ioend *ioend; + struct list_head ioend_list; + unsigned long flags; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_replace_init(&ei->i_iomap_ioend_list, &ioend_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + iomap_sort_ioends(&ioend_list); + while (!list_empty(&ioend_list)) { + ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + iomap_ioend_try_merge(ioend, &ioend_list); + ext4_iomap_finish_ioend(ioend); + } +} + +void ext4_iomap_end_bio(struct bio *bio) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + struct ext4_inode_info *ei = EXT4_I(ioend->io_inode); + struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb); + unsigned long flags; + + /* Only reserved conversions from writeback should enter here */ + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&ei->i_iomap_ioend_list)) + queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work); + list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d4afaae8ab38..811a46f83bb9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1431,11 +1431,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + INIT_LIST_HEAD(&ei->i_iomap_ioend_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode;

[RFC,v3,19/26] ext4: implement writeback iomap path

Commit Message

Patch