Message ID | PUZPR04MB631680D4803CEE2B1A7F42F381A6A@PUZPR04MB6316.apcprd04.prod.outlook.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | exfat: get file size from DataLength | expand |
[snip] > + map_bh(bh_result, sb, phys); > + if (buffer_delay(bh_result)) > + clear_buffer_delay(bh_result); > + > if (create) { > + sector_t valid_blks; > + > + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); > + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) { > + max_blocks = valid_blks - iblock; > + goto done; > + } I don't know why this check is needed. And Why do you call exfat_map_new_buffer() about < valid blocks ? > + > err = exfat_map_new_buffer(ei, bh_result, pos); > if (err) { > exfat_fs_error(sb,
> From: Namjae Jeon <linkinjeon@kernel.org> > Sent: Monday, November 27, 2023 8:12 AM > To: Mo, Yuezhang <Yuezhang.Mo@sony.com> > Cc: sj1557.seo@samsung.com; linux-fsdevel@vger.kernel.org; Wu, Andy <Andy.Wu@sony.com>; Aoyama, Wataru (SGC) <Wataru.Aoyama@sony.com> > Subject: Re: [PATCH v4 1/2] exfat: change to get file size from DataLength > > [snip] > > + map_bh(bh_result, sb, phys); > > + if (buffer_delay(bh_result)) > > + clear_buffer_delay(bh_result); > > + > > if (create) { > > + sector_t valid_blks; > > + > > + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); > > + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) { > > + max_blocks = valid_blks - iblock; > > + goto done; > > + } > I don't know why this check is needed. And Why do you call > exfat_map_new_buffer() about < valid blocks ? If part of a block has been written, the written part needs to be read out and the unwritten part needs to be zeroed before writing. So unwritten area and written area need to be mapped separately. There is no need to call exfat_map_new_buffer() if iblock + max_blocks < valid_blks. I will update the code. > > + > > err = exfat_map_new_buffer(ei, bh_result, pos); > > if (err) { > > exfat_fs_error(sb,
> In stream extension directory entry, the ValidDataLength field describes > how far into the data stream user data has been written, and the > DataLength field describes the file size. > > Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com> > Reviewed-by: Andy Wu <Andy.Wu@sony.com> > Reviewed-by: Aoyama Wataru <wataru.aoyama@sony.com> > --- > fs/exfat/exfat_fs.h | 2 + > fs/exfat/file.c | 122 +++++++++++++++++++++++++++++++++++++++++++- > fs/exfat/inode.c | 96 ++++++++++++++++++++++++++++------ > fs/exfat/namei.c | 6 +++ > 4 files changed, 207 insertions(+), 19 deletions(-) [snip] > +static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct > +iov_iter *iter) { > + ssize_t ret; > + struct file *file = iocb->ki_filp; > + struct inode *inode = file_inode(file); > + struct exfat_inode_info *ei = EXFAT_I(inode); > + loff_t pos = iocb->ki_pos; > + loff_t valid_size; > + > + inode_lock(inode); > + > + valid_size = ei->valid_size; > + > + ret = generic_write_checks(iocb, iter); > + if (ret < 0) > + goto unlock; > + > + if (pos > valid_size) { > + ret = exfat_file_zeroed_range(file, valid_size, pos); > + if (ret < 0 && ret != -ENOSPC) { > + exfat_err(inode->i_sb, > + "write: fail to zero from %llu to %llu(%ld)", > + valid_size, pos, ret); > + } > + if (ret < 0) > + goto unlock; > + } > + > + ret = __generic_file_write_iter(iocb, iter); > + if (ret < 0) > + goto unlock; > + > + inode_unlock(inode); > + > + if (pos > valid_size && iocb_is_dsync(iocb)) { > + ssize_t err = vfs_fsync_range(file, valid_size, pos - 1, > + iocb->ki_flags & IOCB_SYNC); If there is a hole between valid_size and pos, it seems to call sync twice. Is there any reason to call separately? Why don't you call the vfs_fsync_range only once for the merged scope [valid_size:end]? > + if (err < 0) > + return err; > + } > + > + if (ret) > + ret = generic_write_sync(iocb, ret); > + > + return ret; > + > +unlock: > + inode_unlock(inode); > + > + return ret; > +} > + [snip] > @@ -75,8 +75,7 @@ int __exfat_write_inode(struct inode *inode, int sync) > if (ei->start_clu == EXFAT_EOF_CLUSTER) > on_disk_size = 0; > > - ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); > - ep2->dentry.stream.size = ep2->dentry.stream.valid_size; > + ep2->dentry.stream.size = cpu_to_le64(on_disk_size); > if (on_disk_size) { > ep2->dentry.stream.flags = ei->flags; > ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); > @@ -85,6 +84,8 @@ int __exfat_write_inode(struct inode *inode, int sync) > ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; > } > > + ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size); Is there any reason to not only change the value but also move the line down? > + > exfat_update_dir_chksum_with_entry_set(&es); > return exfat_put_dentry_set(&es, sync); } @@ -306,17 +307,25 @@ > static int exfat_get_block(struct inode *inode, sector_t iblock, > mapped_blocks = sbi->sect_per_clus - sec_offset; > max_blocks = min(mapped_blocks, max_blocks); > > - /* Treat newly added block / cluster */ > - if (iblock < last_block) > - create = 0; > - > - if (create || buffer_delay(bh_result)) { > - pos = EXFAT_BLK_TO_B((iblock + 1), sb); > + pos = EXFAT_BLK_TO_B((iblock + 1), sb); > + if ((create && iblock >= last_block) || buffer_delay(bh_result)) { > if (ei->i_size_ondisk < pos) > ei->i_size_ondisk = pos; > } > > + map_bh(bh_result, sb, phys); > + if (buffer_delay(bh_result)) > + clear_buffer_delay(bh_result); > + > if (create) { > + sector_t valid_blks; > + > + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); > + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) > { > + max_blocks = valid_blks - iblock; > + goto done; > + } > + You removed the code for handling the case for (iblock < last_block). So, under all write call-flows, it could be buffer_new abnormally. It seems wrong. right? > err = exfat_map_new_buffer(ei, bh_result, pos); > if (err) { > exfat_fs_error(sb, [snip] > @@ -436,8 +485,20 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, > struct iov_iter *iter) > * condition of exfat_get_block() and ->truncate(). > */ > ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); > - if (ret < 0 && (rw & WRITE)) > - exfat_write_failed(mapping, size); > + if (ret < 0) { > + if (rw & WRITE) > + exfat_write_failed(mapping, size); > + > + if (ret != -EIOCBQUEUED) > + return ret; > + } else > + size = pos + ret; > + > + if ((rw & READ) && pos < ei->valid_size && ei->valid_size < size) { > + iov_iter_revert(iter, size - ei->valid_size); > + iov_iter_zero(size - ei->valid_size, iter); > + } This approach causes unnecessary reads to the range after valid_size, right? But it looks very simple and clear. Hum... Do you have any plan to handle the before and after of valid_size separately?
> From: Sungjong Seo <sj1557.seo@samsung.com> > Sent: Tuesday, November 28, 2023 5:21 PM > To: Mo, Yuezhang <Yuezhang.Mo@sony.com>; linkinjeon@kernel.org > Cc: linux-fsdevel@vger.kernel.org; Wu, Andy <Andy.Wu@sony.com>; Aoyama, > Wataru (SGC) <Wataru.Aoyama@sony.com>; cpgs@samsung.com; > sj1557.seo@samsung.com > Subject: RE: [PATCH v4 1/2] exfat: change to get file size from DataLength > > > In stream extension directory entry, the ValidDataLength field describes > how far into the data stream user data has been written, and the > DataLength > field describes the file size. > > > Signed-off-by: Yuezhang Mo <Yuezhang. Mo@ sony. com> > > In stream extension directory entry, the ValidDataLength field > > describes how far into the data stream user data has been written, and > > the DataLength field describes the file size. > > > > Signed-off-by: Yuezhang Mo <mailto:Yuezhang.Mo@sony.com> > > Reviewed-by: Andy Wu <mailto:Andy.Wu@sony.com> > > Reviewed-by: Aoyama Wataru <mailto:wataru.aoyama@sony.com> > > --- > > fs/exfat/exfat_fs.h | 2 + > > fs/exfat/file.c | 122 > +++++++++++++++++++++++++++++++++++++++++++- > > fs/exfat/inode.c | 96 ++++++++++++++++++++++++++++------ > > fs/exfat/namei.c | 6 +++ > > 4 files changed, 207 insertions(+), 19 deletions(-) > [snip] > > +static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct > > +iov_iter *iter) { > > + ssize_t ret; > > + struct file *file = iocb->ki_filp; > > + struct inode *inode = file_inode(file); > > + struct exfat_inode_info *ei = EXFAT_I(inode); > > + loff_t pos = iocb->ki_pos; > > + loff_t valid_size; > > + > > + inode_lock(inode); > > + > > + valid_size = ei->valid_size; > > + > > + ret = generic_write_checks(iocb, iter); > > + if (ret < 0) > > + goto unlock; > > + > > + if (pos > valid_size) { > > + ret = exfat_file_zeroed_range(file, valid_size, pos); > > + if (ret < 0 && ret != -ENOSPC) { > > + exfat_err(inode->i_sb, > > + "write: fail to zero from %llu to %llu(%ld)", > > + valid_size, pos, ret); > > + } > > + if (ret < 0) > > + goto unlock; > > + } > > + > > + ret = __generic_file_write_iter(iocb, iter); > > + if (ret < 0) > > + goto unlock; > > + > > + inode_unlock(inode); > > + > > + if (pos > valid_size && iocb_is_dsync(iocb)) { > > + ssize_t err = vfs_fsync_range(file, valid_size, pos - 1, > > + iocb->ki_flags & IOCB_SYNC); > If there is a hole between valid_size and pos, it seems to call sync twice. > Is there any reason to call separately? > Why don't you call the vfs_fsync_range only once for the merged scope > [valid_size:end]? For better debugging, I kept the original logic and added new logic for valid_size. For now, it is unnecessary, I will change to sync once. > > > + if (err < 0) > > + return err; > > + } > > + > > + if (ret) > > + ret = generic_write_sync(iocb, ret); > > + > > + return ret; > > + > > +unlock: > > + inode_unlock(inode); > > + > > + return ret; > > +} > > + > [snip] > > @@ -75,8 +75,7 @@ int __exfat_write_inode(struct inode *inode, int sync) > > if (ei->start_clu == EXFAT_EOF_CLUSTER) > > on_disk_size = 0; > > > > - ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); > > - ep2->dentry.stream.size = ep2->dentry.stream.valid_size; > > + ep2->dentry.stream.size = cpu_to_le64(on_disk_size); > > if (on_disk_size) { > > ep2->dentry.stream.flags = ei->flags; > > ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); @@ -85,6 > > +84,8 @@ int __exfat_write_inode(struct inode *inode, int sync) > > ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; > > } > > > > + ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size); > Is there any reason to not only change the value but also move the line down? I will move it back the original line. > > > + > > exfat_update_dir_chksum_with_entry_set(&es); > > return exfat_put_dentry_set(&es, sync); } @@ -306,17 +307,25 @@ > > static int exfat_get_block(struct inode *inode, sector_t iblock, > > mapped_blocks = sbi->sect_per_clus - sec_offset; > > max_blocks = min(mapped_blocks, max_blocks); > > > > - /* Treat newly added block / cluster */ > > - if (iblock < last_block) > > - create = 0; > > - > > - if (create || buffer_delay(bh_result)) { > > - pos = EXFAT_BLK_TO_B((iblock + 1), sb); > > + pos = EXFAT_BLK_TO_B((iblock + 1), sb); > > + if ((create && iblock >= last_block) || buffer_delay(bh_result)) { > > if (ei->i_size_ondisk < pos) > > ei->i_size_ondisk = pos; > > } > > > > + map_bh(bh_result, sb, phys); > > + if (buffer_delay(bh_result)) > > + clear_buffer_delay(bh_result); > > + > > if (create) { > > + sector_t valid_blks; > > + > > + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); > > + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) > > { > > + max_blocks = valid_blks - iblock; > > + goto done; > > + } > > + > You removed the code for handling the case for (iblock < last_block). > So, under all write call-flows, it could be buffer_new abnormally. > It seems wrong. right? Yes, I will update this patch. Without this patch, last_block is equal with valid_blks, exfat_map_new_buffer() should be called if iblock + max_blocks > last_block. With this patch, last_block >= valid_blks, exfat_map_new_buffer() should be called if iblock + max_blocks > valid_blks. > > > err = exfat_map_new_buffer(ei, bh_result, pos); > > if (err) { > > exfat_fs_error(sb, > [snip] > > @@ -436,8 +485,20 @@ static ssize_t exfat_direct_IO(struct kiocb > > *iocb, struct iov_iter *iter) > > * condition of exfat_get_block() and ->truncate(). > > */ > > ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); > > - if (ret < 0 && (rw & WRITE)) > > - exfat_write_failed(mapping, size); > > + if (ret < 0) { > > + if (rw & WRITE) > > + exfat_write_failed(mapping, size); > > + > > + if (ret != -EIOCBQUEUED) > > + return ret; > > + } else > > + size = pos + ret; > > + > > + if ((rw & READ) && pos < ei->valid_size && ei->valid_size < size) { > > + iov_iter_revert(iter, size - ei->valid_size); > > + iov_iter_zero(size - ei->valid_size, iter); > > + } > > This approach causes unnecessary reads to the range after valid_size, right? I don't think so. If the blocks across valid_size, the iov_iter will be handle as 1. Read the blocks before valid_size. 2. Read the block where valid_size is located and set the area after valid_size to zero. 3. zero the buffer of the blocks after valid_size(not read from disk) So there are unnecessary zeroing here(in 1 and 2), no unnecessary reads. I will remove the unnecessary zeroing. > But it looks very simple and clear. > > Hum... > Do you have any plan to handle the before and after of valid_size separately? >
[snip] > > > + if (pos > valid_size && iocb_is_dsync(iocb)) { > > > + ssize_t err = vfs_fsync_range(file, valid_size, pos - 1, > > > + iocb->ki_flags & IOCB_SYNC); > > If there is a hole between valid_size and pos, it seems to call sync > twice. > > Is there any reason to call separately? > > Why don't you call the vfs_fsync_range only once for the merged scope > > [valid_size:end]? > > For better debugging, I kept the original logic and added new logic for > valid_size. > For now, it is unnecessary, I will change to sync once. Thanks. > > > [snip] > > Is there any reason to not only change the value but also move the line > down? > > I will move it back the original line. Sounds good! > > > > > > + > > > exfat_update_dir_chksum_with_entry_set(&es); > > > return exfat_put_dentry_set(&es, sync); } @@ -306,17 +307,25 @@ > > > static int exfat_get_block(struct inode *inode, sector_t iblock, > > > mapped_blocks = sbi->sect_per_clus - sec_offset; > > > max_blocks = min(mapped_blocks, max_blocks); > > > > > > - /* Treat newly added block / cluster */ > > > - if (iblock < last_block) > > > - create = 0; > > > - > > > - if (create || buffer_delay(bh_result)) { > > > - pos = EXFAT_BLK_TO_B((iblock + 1), sb); > > > + pos = EXFAT_BLK_TO_B((iblock + 1), sb); > > > + if ((create && iblock >= last_block) || buffer_delay(bh_result)) { > > > if (ei->i_size_ondisk < pos) > > > ei->i_size_ondisk = pos; > > > } > > > > > > + map_bh(bh_result, sb, phys); > > > + if (buffer_delay(bh_result)) > > > + clear_buffer_delay(bh_result); > > > + > > > if (create) { > > > + sector_t valid_blks; > > > + > > > + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); > > > + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) > > > { > > > + max_blocks = valid_blks - iblock; > > > + goto done; > > > + } > > > + > > You removed the code for handling the case for (iblock < last_block). > > So, under all write call-flows, it could be buffer_new abnormally. > > It seems wrong. right? > > Yes, I will update this patch. > > Without this patch, last_block is equal with valid_blks, > exfat_map_new_buffer() should be called if iblock + max_blocks > > last_block. > > With this patch, last_block >= valid_blks, exfat_map_new_buffer() should > be called if iblock + max_blocks > valid_blks. Okay. > > > > > > err = exfat_map_new_buffer(ei, bh_result, pos); > > > if (err) { > > > exfat_fs_error(sb, > > [snip] > > > @@ -436,8 +485,20 @@ static ssize_t exfat_direct_IO(struct kiocb > > > *iocb, struct iov_iter *iter) > > > * condition of exfat_get_block() and ->truncate(). > > > */ > > > ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); > > > - if (ret < 0 && (rw & WRITE)) > > > - exfat_write_failed(mapping, size); > > > + if (ret < 0) { > > > + if (rw & WRITE) > > > + exfat_write_failed(mapping, size); > > > + > > > + if (ret != -EIOCBQUEUED) > > > + return ret; > > > + } else > > > + size = pos + ret; > > > + > > > + if ((rw & READ) && pos < ei->valid_size && ei->valid_size < size) { > > > + iov_iter_revert(iter, size - ei->valid_size); > > > + iov_iter_zero(size - ei->valid_size, iter); > > > + } > > > > This approach causes unnecessary reads to the range after valid_size, > right? > > I don't think so. > > If the blocks across valid_size, the iov_iter will be handle as 1. Read > the blocks before valid_size. > 2. Read the block where valid_size is located and set the area after > valid_size to zero. > 3. zero the buffer of the blocks after valid_size(not read from disk) > > So there are unnecessary zeroing here(in 1 and 2), no unnecessary reads. > I will remove the unnecessary zeroing. You are right. There might be no need to change. It could be handled in do_direct_IO() with get_block newly modifed. Thanks. B. R. Sungjong Seo
> > > [snip] > > > > @@ -436,8 +485,20 @@ static ssize_t exfat_direct_IO(struct kiocb > > > > *iocb, struct iov_iter *iter) > > > > * condition of exfat_get_block() and ->truncate(). > > > > */ > > > > ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); > > > > - if (ret < 0 && (rw & WRITE)) > > > > - exfat_write_failed(mapping, size); > > > > + if (ret < 0) { > > > > + if (rw & WRITE) > > > > + exfat_write_failed(mapping, size); > > > > + > > > > + if (ret != -EIOCBQUEUED) > > > > + return ret; > > > > + } else > > > > + size = pos + ret; > > > > + > > > > + if ((rw & READ) && pos < ei->valid_size && ei->valid_size < size) { > > > > + iov_iter_revert(iter, size - ei->valid_size); > > > > + iov_iter_zero(size - ei->valid_size, iter); > > > > + } > > > > > > This approach causes unnecessary reads to the range after > > > valid_size, > > right? > > > > I don't think so. > > > > If the blocks across valid_size, the iov_iter will be handle as 1. > > Read the blocks before valid_size. > > 2. Read the block where valid_size is located and set the area after > > valid_size to zero. > > 3. zero the buffer of the blocks after valid_size(not read from disk) > > > > So there are unnecessary zeroing here(in 1 and 2), no unnecessary reads. > > I will remove the unnecessary zeroing. > > You are right. There might be no need to change. > It could be handled in do_direct_IO() with get_block newly modifed. Since "size = pos + ret;" is updated to the actual read position, there is no unnecessary zeroing. The code does not need to be updated.
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a7a2c35d74fb..e3b1f8e022df 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -208,6 +208,7 @@ struct exfat_dir_entry { unsigned char flags; unsigned short attr; loff_t size; + loff_t valid_size; unsigned int num_subdirs; struct timespec64 atime; struct timespec64 mtime; @@ -317,6 +318,7 @@ struct exfat_inode_info { loff_t i_size_aligned; /* on-disk position of directory entry or 0 */ loff_t i_pos; + loff_t valid_size; /* hash by i_location */ struct hlist_node i_hash_fat; /* protect bmap against truncate */ diff --git a/fs/exfat/file.c b/fs/exfat/file.c index bfdfafe00993..7fec9a660734 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -11,6 +11,7 @@ #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/msdos_fs.h> +#include <linux/writeback.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -26,6 +27,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) return err; inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + EXFAT_I(inode)->valid_size = size; mark_inode_dirty(inode); if (!IS_SYNC(inode)) @@ -146,6 +148,9 @@ int __exfat_truncate(struct inode *inode) ei->start_clu = EXFAT_EOF_CLUSTER; } + if (i_size_read(inode) < ei->valid_size) + ei->valid_size = i_size_read(inode); + if (ei->type == TYPE_FILE) ei->attr |= EXFAT_ATTR_ARCHIVE; @@ -474,15 +479,128 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return blkdev_issue_flush(inode->i_sb->s_bdev); } +static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end) +{ + int err; + struct inode *inode = file_inode(file); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *ops = mapping->a_ops; + + while (start < end) { + u32 zerofrom, len; + struct page *page = NULL; + + zerofrom = start & (PAGE_SIZE - 1); + len = PAGE_SIZE - zerofrom; + if (start + len > end) + len = end - start; + + err = ops->write_begin(file, mapping, start, len, &page, NULL); + if (err) + goto out; + + zero_user_segment(page, zerofrom, zerofrom + len); + + err = ops->write_end(file, mapping, start, len, len, page, NULL); + if (err < 0) + goto out; + start += len; + + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + + ei->valid_size = end; + mark_inode_dirty(inode); + +out: + return err; +} + +static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t ret; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t pos = iocb->ki_pos; + loff_t valid_size; + + inode_lock(inode); + + valid_size = ei->valid_size; + + ret = generic_write_checks(iocb, iter); + if (ret < 0) + goto unlock; + + if (pos > valid_size) { + ret = exfat_file_zeroed_range(file, valid_size, pos); + if (ret < 0 && ret != -ENOSPC) { + exfat_err(inode->i_sb, + "write: fail to zero from %llu to %llu(%ld)", + valid_size, pos, ret); + } + if (ret < 0) + goto unlock; + } + + ret = __generic_file_write_iter(iocb, iter); + if (ret < 0) + goto unlock; + + inode_unlock(inode); + + if (pos > valid_size && iocb_is_dsync(iocb)) { + ssize_t err = vfs_fsync_range(file, valid_size, pos - 1, + iocb->ki_flags & IOCB_SYNC); + if (err < 0) + return err; + } + + if (ret) + ret = generic_write_sync(iocb, ret); + + return ret; + +unlock: + inode_unlock(inode); + + return ret; +} + +static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + struct inode *inode = file_inode(file); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + loff_t end = min_t(loff_t, i_size_read(inode), + start + vma->vm_end - vma->vm_start); + + if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) { + ret = exfat_file_zeroed_range(file, ei->valid_size, end); + if (ret < 0) { + exfat_err(inode->i_sb, + "mmap: fail to zero from %llu to %llu(%d)", + start, end, ret); + return ret; + } + } + + return generic_file_mmap(file, vma); +} + const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, + .write_iter = exfat_file_write_iter, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = exfat_file_mmap, .fsync = exfat_file_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index e7ff58b8e68c..4a06df3ed4b7 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -75,8 +75,7 @@ int __exfat_write_inode(struct inode *inode, int sync) if (ei->start_clu == EXFAT_EOF_CLUSTER) on_disk_size = 0; - ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); - ep2->dentry.stream.size = ep2->dentry.stream.valid_size; + ep2->dentry.stream.size = cpu_to_le64(on_disk_size); if (on_disk_size) { ep2->dentry.stream.flags = ei->flags; ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); @@ -85,6 +84,8 @@ int __exfat_write_inode(struct inode *inode, int sync) ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } + ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size); + exfat_update_dir_chksum_with_entry_set(&es); return exfat_put_dentry_set(&es, sync); } @@ -306,17 +307,25 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, mapped_blocks = sbi->sect_per_clus - sec_offset; max_blocks = min(mapped_blocks, max_blocks); - /* Treat newly added block / cluster */ - if (iblock < last_block) - create = 0; - - if (create || buffer_delay(bh_result)) { - pos = EXFAT_BLK_TO_B((iblock + 1), sb); + pos = EXFAT_BLK_TO_B((iblock + 1), sb); + if ((create && iblock >= last_block) || buffer_delay(bh_result)) { if (ei->i_size_ondisk < pos) ei->i_size_ondisk = pos; } + map_bh(bh_result, sb, phys); + if (buffer_delay(bh_result)) + clear_buffer_delay(bh_result); + if (create) { + sector_t valid_blks; + + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) { + max_blocks = valid_blks - iblock; + goto done; + } + err = exfat_map_new_buffer(ei, bh_result, pos); if (err) { exfat_fs_error(sb, @@ -324,11 +333,35 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, pos, ei->i_size_aligned); goto unlock_ret; } + } else { + size_t b_size = EXFAT_BLK_TO_B(max_blocks, sb); + + pos -= sb->s_blocksize; + if (pos >= ei->valid_size) { + /* Read out of valid data */ + clear_buffer_mapped(bh_result); + } else if (pos + b_size <= ei->valid_size) { + /* Normal read */ + } else if (pos + sb->s_blocksize <= ei->valid_size) { + /* Normal short read */ + max_blocks = 1; + } else { + /* Read across valid size */ + if (bh_result->b_folio) { + loff_t size = ei->valid_size - pos; + loff_t off = pos & (PAGE_SIZE - 1); + + folio_set_bh(bh_result, bh_result->b_folio, off); + err = bh_read(bh_result, 0); + if (err < 0) + goto unlock_ret; + + folio_zero_segment(bh_result->b_folio, off + size, + off + sb->s_blocksize); + } + max_blocks = 1; + } } - - if (buffer_delay(bh_result)) - clear_buffer_delay(bh_result); - map_bh(bh_result, sb, phys); done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); unlock_ret: @@ -343,6 +376,17 @@ static int exfat_read_folio(struct file *file, struct folio *folio) static void exfat_readahead(struct readahead_control *rac) { + struct address_space *mapping = rac->mapping; + struct inode *inode = mapping->host; + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t pos = readahead_pos(rac); + + /* Range cross valid_size, read it page by page. */ + if (ei->valid_size < i_size_read(inode) && + pos <= ei->valid_size && + ei->valid_size < pos + readahead_length(rac)) + return; + mpage_readahead(rac, exfat_get_block); } @@ -370,9 +414,7 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, int ret; *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, - exfat_get_block, - &EXFAT_I(mapping->host)->i_size_ondisk); + ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); @@ -400,6 +442,11 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, if (err < len) exfat_write_failed(mapping, pos+len); + if (!(err < 0) && pos + err > ei->valid_size) { + ei->valid_size = pos + err; + mark_inode_dirty(inode); + } + if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ei->attr |= EXFAT_ATTR_ARCHIVE; @@ -413,6 +460,8 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t pos = iocb->ki_pos; loff_t size = iocb->ki_pos + iov_iter_count(iter); int rw = iov_iter_rw(iter); ssize_t ret; @@ -436,8 +485,20 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * condition of exfat_get_block() and ->truncate(). */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); - if (ret < 0 && (rw & WRITE)) - exfat_write_failed(mapping, size); + if (ret < 0) { + if (rw & WRITE) + exfat_write_failed(mapping, size); + + if (ret != -EIOCBQUEUED) + return ret; + } else + size = pos + ret; + + if ((rw & READ) && pos < ei->valid_size && ei->valid_size < size) { + iov_iter_revert(iter, size - ei->valid_size); + iov_iter_zero(size - ei->valid_size, iter); + } + return ret; } @@ -537,6 +598,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) ei->start_clu = info->start_clu; ei->flags = info->flags; ei->type = info->type; + ei->valid_size = info->valid_size; ei->version = 0; ei->hint_stat.eidx = 0; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 5d737e0b639a..9c549fd11fc8 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -406,6 +406,7 @@ static int exfat_find_empty_entry(struct inode *inode, i_size_write(inode, size); ei->i_size_ondisk += sbi->cluster_size; ei->i_size_aligned += sbi->cluster_size; + ei->valid_size += sbi->cluster_size; ei->flags = p_dir->flags; inode->i_blocks += sbi->cluster_size >> 9; } @@ -558,6 +559,8 @@ static int exfat_add_entry(struct inode *inode, const char *path, info->size = clu_size; info->num_subdirs = EXFAT_MIN_SUBDIR; } + info->valid_size = info->size; + memset(&info->crtime, 0, sizeof(info->crtime)); memset(&info->mtime, 0, sizeof(info->mtime)); memset(&info->atime, 0, sizeof(info->atime)); @@ -660,6 +663,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); info->size = le64_to_cpu(ep2->dentry.stream.valid_size); + info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); + info->size = le64_to_cpu(ep2->dentry.stream.size); if (info->size == 0) { info->flags = ALLOC_NO_FAT_CHAIN; info->start_clu = EXFAT_EOF_CLUSTER; @@ -1288,6 +1293,7 @@ static int __exfat_rename(struct inode *old_parent_inode, } i_size_write(new_inode, 0); + new_ei->valid_size = 0; new_ei->start_clu = EXFAT_EOF_CLUSTER; new_ei->flags = ALLOC_NO_FAT_CHAIN; }