Message ID | 20240510023906.281700-1-chao@kernel.org (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [f2fs-dev,v2] f2fs: fix to avoid racing in between read and OPU dio write | expand |
On 05/10, Chao Yu wrote: > If lfs mode is on, buffered read may race w/ OPU dio write as below, > it may cause buffered read hits unwritten data unexpectly, and for > dio read, the race condition exists as well. > > Thread A Thread B > - f2fs_file_write_iter > - f2fs_dio_write_iter > - __iomap_dio_rw > - f2fs_iomap_begin > - f2fs_map_blocks > - __allocate_data_block > - allocated blkaddr #x > - iomap_dio_submit_bio > - f2fs_file_read_iter > - filemap_read > - f2fs_read_data_folio > - f2fs_mpage_readpages > - f2fs_map_blocks > : get blkaddr #x > - f2fs_submit_read_bio > IRQ > - f2fs_read_end_io > : read IO on blkaddr #x complete > IRQ > - iomap_dio_bio_end_io > : direct write IO on blkaddr #x complete > > This patch introduces a new per-inode i_opu_rwsem lock to avoid > such race condition. Wasn't this supposed to be managed by user-land? > > Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") > Signed-off-by: Chao Yu <chao@kernel.org> > --- > v2: > - fix to cover dio read path w/ i_opu_rwsem as well. > fs/f2fs/f2fs.h | 1 + > fs/f2fs/file.c | 28 ++++++++++++++++++++++++++-- > fs/f2fs/super.c | 1 + > 3 files changed, 28 insertions(+), 2 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index 30058e16a5d0..91cf4b3d6bc6 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -847,6 +847,7 @@ struct f2fs_inode_info { > /* avoid racing between foreground op and gc */ > struct f2fs_rwsem i_gc_rwsem[2]; > struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ > + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */ > > int i_extra_isize; /* size of extra space located in i_addr */ > kprojid_t i_projid; /* id for project quota */ > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > index 72ce1a522fb2..4ec260af321f 100644 > --- a/fs/f2fs/file.c > +++ b/fs/f2fs/file.c > @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > const loff_t pos = iocb->ki_pos; > const size_t count = iov_iter_count(to); > struct iomap_dio *dio; > + bool do_opu = f2fs_lfs_mode(sbi); > ssize_t ret; > > if (count == 0) > @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > ret = -EAGAIN; > goto out; > } > + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) { > + f2fs_up_read(&fi->i_gc_rwsem[READ]); > + ret = -EAGAIN; > + goto out; > + } > } else { > f2fs_down_read(&fi->i_gc_rwsem[READ]); > + f2fs_down_read(&fi->i_opu_rwsem); > } > > /* > @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > ret = iomap_dio_complete(dio); > } > > + f2fs_up_read(&fi->i_opu_rwsem); > f2fs_up_read(&fi->i_gc_rwsem[READ]); > > file_accessed(file); > @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > if (f2fs_should_use_dio(inode, iocb, to)) { > ret = f2fs_dio_read_iter(iocb, to); > } else { > + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode)); > + > + if (do_opu) > + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem); > ret = filemap_read(iocb, to, 0); > + if (do_opu) > + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem); > if (ret > 0) > f2fs_update_iostat(F2FS_I_SB(inode), inode, > APP_BUFFERED_READ_IO, ret); > @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, > ret = -EAGAIN; > goto out; > } > + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) { > + f2fs_up_read(&fi->i_gc_rwsem[READ]); > + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); > + ret = -EAGAIN; > + goto out; > + } > } else { > ret = f2fs_convert_inline_inode(inode); > if (ret) > goto out; > > f2fs_down_read(&fi->i_gc_rwsem[WRITE]); > - if (do_opu) > + if (do_opu) { > f2fs_down_read(&fi->i_gc_rwsem[READ]); > + f2fs_down_write(&fi->i_opu_rwsem); > + } > } > > /* > @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, > ret = iomap_dio_complete(dio); > } > > - if (do_opu) > + if (do_opu) { > + f2fs_up_write(&fi->i_opu_rwsem); > f2fs_up_read(&fi->i_gc_rwsem[READ]); > + } > f2fs_up_read(&fi->i_gc_rwsem[WRITE]); > > if (ret < 0) > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c > index daf2c4dbe150..b4ed3b094366 100644 > --- a/fs/f2fs/super.c > +++ b/fs/f2fs/super.c > @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) > init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); > init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); > init_f2fs_rwsem(&fi->i_xattr_sem); > + init_f2fs_rwsem(&fi->i_opu_rwsem); > > /* Will be used by directory only */ > fi->i_dir_level = F2FS_SB(sb)->dir_level; > -- > 2.40.1
On 2024/5/15 0:09, Jaegeuk Kim wrote: > On 05/10, Chao Yu wrote: >> If lfs mode is on, buffered read may race w/ OPU dio write as below, >> it may cause buffered read hits unwritten data unexpectly, and for >> dio read, the race condition exists as well. >> >> Thread A Thread B >> - f2fs_file_write_iter >> - f2fs_dio_write_iter >> - __iomap_dio_rw >> - f2fs_iomap_begin >> - f2fs_map_blocks >> - __allocate_data_block >> - allocated blkaddr #x >> - iomap_dio_submit_bio >> - f2fs_file_read_iter >> - filemap_read >> - f2fs_read_data_folio >> - f2fs_mpage_readpages >> - f2fs_map_blocks >> : get blkaddr #x >> - f2fs_submit_read_bio >> IRQ >> - f2fs_read_end_io >> : read IO on blkaddr #x complete >> IRQ >> - iomap_dio_bio_end_io >> : direct write IO on blkaddr #x complete >> >> This patch introduces a new per-inode i_opu_rwsem lock to avoid >> such race condition. > > Wasn't this supposed to be managed by user-land? Actually, the test case is: 1. mount w/ lfs mode 2. touch file; 3. initialize file w/ 4k zeroed data; fsync; 4. continue triggering dio write 4k zeroed data to file; 5. and meanwhile, continue triggering buf/dio 4k read in file, use md5sum to verify the 4k data; It expects data is all zero, however it turned out it's not. Thanks, > >> >> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") >> Signed-off-by: Chao Yu <chao@kernel.org> >> --- >> v2: >> - fix to cover dio read path w/ i_opu_rwsem as well. >> fs/f2fs/f2fs.h | 1 + >> fs/f2fs/file.c | 28 ++++++++++++++++++++++++++-- >> fs/f2fs/super.c | 1 + >> 3 files changed, 28 insertions(+), 2 deletions(-) >> >> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >> index 30058e16a5d0..91cf4b3d6bc6 100644 >> --- a/fs/f2fs/f2fs.h >> +++ b/fs/f2fs/f2fs.h >> @@ -847,6 +847,7 @@ struct f2fs_inode_info { >> /* avoid racing between foreground op and gc */ >> struct f2fs_rwsem i_gc_rwsem[2]; >> struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ >> + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */ >> >> int i_extra_isize; /* size of extra space located in i_addr */ >> kprojid_t i_projid; /* id for project quota */ >> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >> index 72ce1a522fb2..4ec260af321f 100644 >> --- a/fs/f2fs/file.c >> +++ b/fs/f2fs/file.c >> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >> const loff_t pos = iocb->ki_pos; >> const size_t count = iov_iter_count(to); >> struct iomap_dio *dio; >> + bool do_opu = f2fs_lfs_mode(sbi); >> ssize_t ret; >> >> if (count == 0) >> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >> ret = -EAGAIN; >> goto out; >> } >> + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) { >> + f2fs_up_read(&fi->i_gc_rwsem[READ]); >> + ret = -EAGAIN; >> + goto out; >> + } >> } else { >> f2fs_down_read(&fi->i_gc_rwsem[READ]); >> + f2fs_down_read(&fi->i_opu_rwsem); >> } >> >> /* >> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >> ret = iomap_dio_complete(dio); >> } >> >> + f2fs_up_read(&fi->i_opu_rwsem); >> f2fs_up_read(&fi->i_gc_rwsem[READ]); >> >> file_accessed(file); >> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) >> if (f2fs_should_use_dio(inode, iocb, to)) { >> ret = f2fs_dio_read_iter(iocb, to); >> } else { >> + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode)); >> + >> + if (do_opu) >> + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem); >> ret = filemap_read(iocb, to, 0); >> + if (do_opu) >> + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem); >> if (ret > 0) >> f2fs_update_iostat(F2FS_I_SB(inode), inode, >> APP_BUFFERED_READ_IO, ret); >> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, >> ret = -EAGAIN; >> goto out; >> } >> + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) { >> + f2fs_up_read(&fi->i_gc_rwsem[READ]); >> + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); >> + ret = -EAGAIN; >> + goto out; >> + } >> } else { >> ret = f2fs_convert_inline_inode(inode); >> if (ret) >> goto out; >> >> f2fs_down_read(&fi->i_gc_rwsem[WRITE]); >> - if (do_opu) >> + if (do_opu) { >> f2fs_down_read(&fi->i_gc_rwsem[READ]); >> + f2fs_down_write(&fi->i_opu_rwsem); >> + } >> } >> >> /* >> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, >> ret = iomap_dio_complete(dio); >> } >> >> - if (do_opu) >> + if (do_opu) { >> + f2fs_up_write(&fi->i_opu_rwsem); >> f2fs_up_read(&fi->i_gc_rwsem[READ]); >> + } >> f2fs_up_read(&fi->i_gc_rwsem[WRITE]); >> >> if (ret < 0) >> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c >> index daf2c4dbe150..b4ed3b094366 100644 >> --- a/fs/f2fs/super.c >> +++ b/fs/f2fs/super.c >> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) >> init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); >> init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); >> init_f2fs_rwsem(&fi->i_xattr_sem); >> + init_f2fs_rwsem(&fi->i_opu_rwsem); >> >> /* Will be used by directory only */ >> fi->i_dir_level = F2FS_SB(sb)->dir_level; >> -- >> 2.40.1
On 05/15, Chao Yu wrote: > On 2024/5/15 0:09, Jaegeuk Kim wrote: > > On 05/10, Chao Yu wrote: > > > If lfs mode is on, buffered read may race w/ OPU dio write as below, > > > it may cause buffered read hits unwritten data unexpectly, and for > > > dio read, the race condition exists as well. > > > > > > Thread A Thread B > > > - f2fs_file_write_iter > > > - f2fs_dio_write_iter > > > - __iomap_dio_rw > > > - f2fs_iomap_begin > > > - f2fs_map_blocks > > > - __allocate_data_block > > > - allocated blkaddr #x > > > - iomap_dio_submit_bio > > > - f2fs_file_read_iter > > > - filemap_read > > > - f2fs_read_data_folio > > > - f2fs_mpage_readpages > > > - f2fs_map_blocks > > > : get blkaddr #x > > > - f2fs_submit_read_bio > > > IRQ > > > - f2fs_read_end_io > > > : read IO on blkaddr #x complete > > > IRQ > > > - iomap_dio_bio_end_io > > > : direct write IO on blkaddr #x complete > > > > > > This patch introduces a new per-inode i_opu_rwsem lock to avoid > > > such race condition. > > > > Wasn't this supposed to be managed by user-land? > > Actually, the test case is: > > 1. mount w/ lfs mode > 2. touch file; > 3. initialize file w/ 4k zeroed data; fsync; > 4. continue triggering dio write 4k zeroed data to file; > 5. and meanwhile, continue triggering buf/dio 4k read in file, > use md5sum to verify the 4k data; > > It expects data is all zero, however it turned out it's not. Can we check outstanding write bios instead of abusing locks? > > Thanks, > > > > > > > > > Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") > > > Signed-off-by: Chao Yu <chao@kernel.org> > > > --- > > > v2: > > > - fix to cover dio read path w/ i_opu_rwsem as well. > > > fs/f2fs/f2fs.h | 1 + > > > fs/f2fs/file.c | 28 ++++++++++++++++++++++++++-- > > > fs/f2fs/super.c | 1 + > > > 3 files changed, 28 insertions(+), 2 deletions(-) > > > > > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > > > index 30058e16a5d0..91cf4b3d6bc6 100644 > > > --- a/fs/f2fs/f2fs.h > > > +++ b/fs/f2fs/f2fs.h > > > @@ -847,6 +847,7 @@ struct f2fs_inode_info { > > > /* avoid racing between foreground op and gc */ > > > struct f2fs_rwsem i_gc_rwsem[2]; > > > struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ > > > + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */ > > > > > > int i_extra_isize; /* size of extra space located in i_addr */ > > > kprojid_t i_projid; /* id for project quota */ > > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c > > > index 72ce1a522fb2..4ec260af321f 100644 > > > --- a/fs/f2fs/file.c > > > +++ b/fs/f2fs/file.c > > > @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > > > const loff_t pos = iocb->ki_pos; > > > const size_t count = iov_iter_count(to); > > > struct iomap_dio *dio; > > > + bool do_opu = f2fs_lfs_mode(sbi); > > > ssize_t ret; > > > > > > if (count == 0) > > > @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > > > ret = -EAGAIN; > > > goto out; > > > } > > > + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) { > > > + f2fs_up_read(&fi->i_gc_rwsem[READ]); > > > + ret = -EAGAIN; > > > + goto out; > > > + } > > > } else { > > > f2fs_down_read(&fi->i_gc_rwsem[READ]); > > > + f2fs_down_read(&fi->i_opu_rwsem); > > > } > > > > > > /* > > > @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > > > ret = iomap_dio_complete(dio); > > > } > > > > > > + f2fs_up_read(&fi->i_opu_rwsem); > > > f2fs_up_read(&fi->i_gc_rwsem[READ]); > > > > > > file_accessed(file); > > > @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > > > if (f2fs_should_use_dio(inode, iocb, to)) { > > > ret = f2fs_dio_read_iter(iocb, to); > > > } else { > > > + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode)); > > > + > > > + if (do_opu) > > > + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem); > > > ret = filemap_read(iocb, to, 0); > > > + if (do_opu) > > > + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem); > > > if (ret > 0) > > > f2fs_update_iostat(F2FS_I_SB(inode), inode, > > > APP_BUFFERED_READ_IO, ret); > > > @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, > > > ret = -EAGAIN; > > > goto out; > > > } > > > + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) { > > > + f2fs_up_read(&fi->i_gc_rwsem[READ]); > > > + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); > > > + ret = -EAGAIN; > > > + goto out; > > > + } > > > } else { > > > ret = f2fs_convert_inline_inode(inode); > > > if (ret) > > > goto out; > > > > > > f2fs_down_read(&fi->i_gc_rwsem[WRITE]); > > > - if (do_opu) > > > + if (do_opu) { > > > f2fs_down_read(&fi->i_gc_rwsem[READ]); > > > + f2fs_down_write(&fi->i_opu_rwsem); > > > + } > > > } > > > > > > /* > > > @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, > > > ret = iomap_dio_complete(dio); > > > } > > > > > > - if (do_opu) > > > + if (do_opu) { > > > + f2fs_up_write(&fi->i_opu_rwsem); > > > f2fs_up_read(&fi->i_gc_rwsem[READ]); > > > + } > > > f2fs_up_read(&fi->i_gc_rwsem[WRITE]); > > > > > > if (ret < 0) > > > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c > > > index daf2c4dbe150..b4ed3b094366 100644 > > > --- a/fs/f2fs/super.c > > > +++ b/fs/f2fs/super.c > > > @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) > > > init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); > > > init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); > > > init_f2fs_rwsem(&fi->i_xattr_sem); > > > + init_f2fs_rwsem(&fi->i_opu_rwsem); > > > > > > /* Will be used by directory only */ > > > fi->i_dir_level = F2FS_SB(sb)->dir_level; > > > -- > > > 2.40.1
On 2024/5/15 12:42, Jaegeuk Kim wrote: > On 05/15, Chao Yu wrote: >> On 2024/5/15 0:09, Jaegeuk Kim wrote: >>> On 05/10, Chao Yu wrote: >>>> If lfs mode is on, buffered read may race w/ OPU dio write as below, >>>> it may cause buffered read hits unwritten data unexpectly, and for >>>> dio read, the race condition exists as well. >>>> >>>> Thread A Thread B >>>> - f2fs_file_write_iter >>>> - f2fs_dio_write_iter >>>> - __iomap_dio_rw >>>> - f2fs_iomap_begin >>>> - f2fs_map_blocks >>>> - __allocate_data_block >>>> - allocated blkaddr #x >>>> - iomap_dio_submit_bio >>>> - f2fs_file_read_iter >>>> - filemap_read >>>> - f2fs_read_data_folio >>>> - f2fs_mpage_readpages >>>> - f2fs_map_blocks >>>> : get blkaddr #x >>>> - f2fs_submit_read_bio >>>> IRQ >>>> - f2fs_read_end_io >>>> : read IO on blkaddr #x complete >>>> IRQ >>>> - iomap_dio_bio_end_io >>>> : direct write IO on blkaddr #x complete >>>> >>>> This patch introduces a new per-inode i_opu_rwsem lock to avoid >>>> such race condition. >>> >>> Wasn't this supposed to be managed by user-land? >> >> Actually, the test case is: >> >> 1. mount w/ lfs mode >> 2. touch file; >> 3. initialize file w/ 4k zeroed data; fsync; >> 4. continue triggering dio write 4k zeroed data to file; >> 5. and meanwhile, continue triggering buf/dio 4k read in file, >> use md5sum to verify the 4k data; >> >> It expects data is all zero, however it turned out it's not. > > Can we check outstanding write bios instead of abusing locks? I didn't figure out a way to solve this w/o lock, due to: - write bios can be issued after outstanding write bios check condition, result in the race. - once read() detects that there are outstanding write bios, we need to delay read flow rather than fail it, right? It looks using a lock is more proper here? Any suggestion? Thanks, > >> >> Thanks, >> >>> >>>> >>>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") >>>> Signed-off-by: Chao Yu <chao@kernel.org> >>>> --- >>>> v2: >>>> - fix to cover dio read path w/ i_opu_rwsem as well. >>>> fs/f2fs/f2fs.h | 1 + >>>> fs/f2fs/file.c | 28 ++++++++++++++++++++++++++-- >>>> fs/f2fs/super.c | 1 + >>>> 3 files changed, 28 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >>>> index 30058e16a5d0..91cf4b3d6bc6 100644 >>>> --- a/fs/f2fs/f2fs.h >>>> +++ b/fs/f2fs/f2fs.h >>>> @@ -847,6 +847,7 @@ struct f2fs_inode_info { >>>> /* avoid racing between foreground op and gc */ >>>> struct f2fs_rwsem i_gc_rwsem[2]; >>>> struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ >>>> + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */ >>>> >>>> int i_extra_isize; /* size of extra space located in i_addr */ >>>> kprojid_t i_projid; /* id for project quota */ >>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >>>> index 72ce1a522fb2..4ec260af321f 100644 >>>> --- a/fs/f2fs/file.c >>>> +++ b/fs/f2fs/file.c >>>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>> const loff_t pos = iocb->ki_pos; >>>> const size_t count = iov_iter_count(to); >>>> struct iomap_dio *dio; >>>> + bool do_opu = f2fs_lfs_mode(sbi); >>>> ssize_t ret; >>>> >>>> if (count == 0) >>>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>> ret = -EAGAIN; >>>> goto out; >>>> } >>>> + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) { >>>> + f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>> + ret = -EAGAIN; >>>> + goto out; >>>> + } >>>> } else { >>>> f2fs_down_read(&fi->i_gc_rwsem[READ]); >>>> + f2fs_down_read(&fi->i_opu_rwsem); >>>> } >>>> >>>> /* >>>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>> ret = iomap_dio_complete(dio); >>>> } >>>> >>>> + f2fs_up_read(&fi->i_opu_rwsem); >>>> f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>> >>>> file_accessed(file); >>>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>> if (f2fs_should_use_dio(inode, iocb, to)) { >>>> ret = f2fs_dio_read_iter(iocb, to); >>>> } else { >>>> + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode)); >>>> + >>>> + if (do_opu) >>>> + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem); >>>> ret = filemap_read(iocb, to, 0); >>>> + if (do_opu) >>>> + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem); >>>> if (ret > 0) >>>> f2fs_update_iostat(F2FS_I_SB(inode), inode, >>>> APP_BUFFERED_READ_IO, ret); >>>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, >>>> ret = -EAGAIN; >>>> goto out; >>>> } >>>> + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) { >>>> + f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>> + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); >>>> + ret = -EAGAIN; >>>> + goto out; >>>> + } >>>> } else { >>>> ret = f2fs_convert_inline_inode(inode); >>>> if (ret) >>>> goto out; >>>> >>>> f2fs_down_read(&fi->i_gc_rwsem[WRITE]); >>>> - if (do_opu) >>>> + if (do_opu) { >>>> f2fs_down_read(&fi->i_gc_rwsem[READ]); >>>> + f2fs_down_write(&fi->i_opu_rwsem); >>>> + } >>>> } >>>> >>>> /* >>>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, >>>> ret = iomap_dio_complete(dio); >>>> } >>>> >>>> - if (do_opu) >>>> + if (do_opu) { >>>> + f2fs_up_write(&fi->i_opu_rwsem); >>>> f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>> + } >>>> f2fs_up_read(&fi->i_gc_rwsem[WRITE]); >>>> >>>> if (ret < 0) >>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c >>>> index daf2c4dbe150..b4ed3b094366 100644 >>>> --- a/fs/f2fs/super.c >>>> +++ b/fs/f2fs/super.c >>>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) >>>> init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); >>>> init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); >>>> init_f2fs_rwsem(&fi->i_xattr_sem); >>>> + init_f2fs_rwsem(&fi->i_opu_rwsem); >>>> >>>> /* Will be used by directory only */ >>>> fi->i_dir_level = F2FS_SB(sb)->dir_level; >>>> -- >>>> 2.40.1
On Fri, May 10, 2024 at 10:39:06AM +0800, Chao Yu wrote: > If lfs mode is on, buffered read may race w/ OPU dio write as below, > it may cause buffered read hits unwritten data unexpectly, and for > dio read, the race condition exists as well. > > Thread A Thread B > - f2fs_file_write_iter > - f2fs_dio_write_iter > - __iomap_dio_rw > - f2fs_iomap_begin > - f2fs_map_blocks > - __allocate_data_block > - allocated blkaddr #x > - iomap_dio_submit_bio > - f2fs_file_read_iter > - filemap_read > - f2fs_read_data_folio > - f2fs_mpage_readpages > - f2fs_map_blocks > : get blkaddr #x > - f2fs_submit_read_bio > IRQ > - f2fs_read_end_io > : read IO on blkaddr #x complete > IRQ > - iomap_dio_bio_end_io > : direct write IO on blkaddr #x complete > Looks like every COW filesystem would meet this situation. What's the solution of other FS? > This patch introduces a new per-inode i_opu_rwsem lock to avoid > such race condition. >
…
> This patch introduces a new …
Please choose a corresponding imperative wording for an improved change description.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.9#n94
Regards,
Markus
Hello, kernel test robot noticed "WARNING:at_kernel/locking/rwsem.c:#down_read" on: commit: abf7df61e5c60fed520a09102d576fd41ed4d5ee ("[PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write") url: https://github.com/intel-lab-lkp/linux/commits/Chao-Yu/f2fs-fix-to-avoid-racing-in-between-read-and-OPU-dio-write/20240510-104020 base: https://git.kernel.org/cgit/linux/kernel/git/jaegeuk/f2fs.git dev patch link: https://lore.kernel.org/all/20240510023906.281700-1-chao@kernel.org/ patch subject: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write in testcase: xfstests version: xfstests-x86_64-0e5c12df-1_20240511 with following parameters: disk: 4HDD fs: f2fs test: generic-617 compiler: gcc-13 test machine: 4 threads Intel(R) Core(TM) i5-6500 CPU @ 3.20GHz (Skylake) with 32G memory (please refer to attached dmesg/kmsg for entire log/backtrace) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <oliver.sang@intel.com> | Closes: https://lore.kernel.org/oe-lkp/202405171532.760e22d3-oliver.sang@intel.com [ 160.985543][ T2255] ------------[ cut here ]------------ [ 160.990864][ T2255] WARNING: CPU: 3 PID: 2255 at kernel/locking/rwsem.c:245 down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) [ 160.999715][ T2255] Modules linked in: f2fs crc32_generic intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal btrfs intel_powerclamp blake2b_generic coretemp xor zstd_compress kvm_intel raid6_pq libcrc32c i915 kvm sd_mod t10_pi crc64_rocksoft_generic crct10dif_pclmul crc32_pclmul crc64_rocksoft crc64 crc32c_intel ghash_clmulni_intel sha512_ssse3 sg drm_buddy rapl ipmi_devintf intel_cstate intel_gtt ipmi_msghandler mei_wdt wmi_bmof intel_uncore i2c_i801 drm_display_helper i2c_smbus ahci ttm drm_kms_helper libahci video libata mei_me mei acpi_pad intel_pch_thermal wmi binfmt_misc loop fuse drm dm_mod ip_tables [ 161.053654][ T2255] CPU: 3 PID: 2255 Comm: fsx Tainted: G I 6.9.0-rc1-00036-gabf7df61e5c6 #1 [ 161.063438][ T2255] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.1.1 10/07/2015 [ 161.071504][ T2255] RIP: 0010:down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) [ 161.076376][ T2255] Code: b8 00 00 00 00 00 fc ff df 83 e3 02 48 c1 ea 03 4c 09 fb 48 83 cb 01 80 3c 02 00 0f 85 9a 00 00 00 48 89 5d 08 e9 50 ff ff ff <0f> 0b 4c 8d 7d 08 be 08 00 00 00 4c 89 ff e8 c1 59 e9 fd 4c 89 f8 All code ======== 0: b8 00 00 00 00 mov $0x0,%eax 5: 00 fc add %bh,%ah 7: ff (bad) 8: df 83 e3 02 48 c1 filds -0x3eb7fd1d(%rbx) e: ea (bad) f: 03 4c 09 fb add -0x5(%rcx,%rcx,1),%ecx 13: 48 83 cb 01 or $0x1,%rbx 17: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) 1b: 0f 85 9a 00 00 00 jne 0xbb 21: 48 89 5d 08 mov %rbx,0x8(%rbp) 25: e9 50 ff ff ff jmpq 0xffffffffffffff7a 2a:* 0f 0b ud2 <-- trapping instruction 2c: 4c 8d 7d 08 lea 0x8(%rbp),%r15 30: be 08 00 00 00 mov $0x8,%esi 35: 4c 89 ff mov %r15,%rdi 38: e8 c1 59 e9 fd callq 0xfffffffffde959fe 3d: 4c 89 f8 mov %r15,%rax Code starting with the faulting instruction =========================================== 0: 0f 0b ud2 2: 4c 8d 7d 08 lea 0x8(%rbp),%r15 6: be 08 00 00 00 mov $0x8,%esi b: 4c 89 ff mov %r15,%rdi e: e8 c1 59 e9 fd callq 0xfffffffffde959d4 13: 4c 89 f8 mov %r15,%rax [ 161.095753][ T2255] RSP: 0018:ffffc90002c0f8b8 EFLAGS: 00010286 [ 161.101662][ T2255] RAX: fffffffffffffe00 RBX: fffffffffffffe00 RCX: ffffffff83bdf543 [ 161.109469][ T2255] RDX: ffffed10236632f0 RSI: 0000000000000008 RDI: ffff88811b319778 [ 161.117276][ T2255] RBP: ffff88811b319778 R08: 0000000000000001 R09: ffffed10236632ef [ 161.125080][ T2255] R10: ffff88811b31977f R11: ffffffff85fe96a2 R12: 1ffff92000581f18 [ 161.132885][ T2255] R13: dffffc0000000000 R14: ffffc90002c0fa70 R15: 0000000000000000 [ 161.140691][ T2255] FS: 00007f623f19d740(0000) GS:ffff8887e9380000(0000) knlGS:0000000000000000 [ 161.149446][ T2255] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 161.155871][ T2255] CR2: 00007f623f05b000 CR3: 00000001b2026006 CR4: 00000000003706f0 [ 161.163705][ T2255] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 161.171512][ T2255] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 161.179316][ T2255] Call Trace: [ 161.182456][ T2255] <TASK> [ 161.185254][ T2255] ? __warn (kernel/panic.c:694) [ 161.189181][ T2255] ? down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) [ 161.193456][ T2255] ? report_bug (lib/bug.c:180 lib/bug.c:219) [ 161.197814][ T2255] ? handle_bug (arch/x86/kernel/traps.c:239 (discriminator 1)) [ 161.202003][ T2255] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) [ 161.206532][ T2255] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) [ 161.211414][ T2255] ? down_read (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2723 (discriminator 5) include/linux/atomic/atomic-long.h:163 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3298 (discriminator 5) kernel/locking/rwsem.c:243 (discriminator 5) kernel/locking/rwsem.c:1249 (discriminator 5) kernel/locking/rwsem.c:1263 (discriminator 5) kernel/locking/rwsem.c:1528 (discriminator 5)) [ 161.215600][ T2255] ? down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) [ 161.219854][ T2255] ? down_read (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2723 (discriminator 5) include/linux/atomic/atomic-long.h:163 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3298 (discriminator 5) kernel/locking/rwsem.c:243 (discriminator 5) kernel/locking/rwsem.c:1249 (discriminator 5) kernel/locking/rwsem.c:1263 (discriminator 5) kernel/locking/rwsem.c:1528 (discriminator 5)) [ 161.224034][ T2255] ? __rmqueue_pcplist (include/linux/list.h:215 (discriminator 1) include/linux/list.h:229 (discriminator 1) mm/page_alloc.c:2836 (discriminator 1)) [ 161.228904][ T2255] ? __pfx_down_read (kernel/locking/rwsem.c:1524) [ 161.233566][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) [ 161.238707][ T2255] f2fs_dio_read_iter (fs/f2fs/f2fs.h:2468 (discriminator 1) fs/f2fs/file.c:4477 (discriminator 1)) f2fs [ 161.244288][ T2255] f2fs_file_read_iter (fs/f2fs/file.c:4533) f2fs [ 161.249949][ T2255] copy_splice_read (include/linux/fs.h:2102 fs/splice.c:365) [ 161.254636][ T2255] ? __pfx_copy_splice_read (fs/splice.c:324) [ 161.259843][ T2255] splice_direct_to_actor (fs/splice.c:1136) [ 161.265045][ T2255] ? __pfx_direct_splice_actor (fs/splice.c:1159) [ 161.270507][ T2255] ? __pfx_splice_direct_to_actor (fs/splice.c:1032) [ 161.276229][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) [ 161.281359][ T2255] ? from_kgid_munged (kernel/user_namespace.c:527) [ 161.286131][ T2255] do_splice_direct (fs/splice.c:1208 fs/splice.c:1233) [ 161.290829][ T2255] ? __pfx_do_splice_direct (fs/splice.c:1232) [ 161.296047][ T2255] ? __pfx_direct_file_splice_eof (fs/splice.c:1178) [ 161.301779][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) [ 161.306894][ T2255] ? rw_verify_area (fs/read_write.c:377) [ 161.311507][ T2255] vfs_copy_file_range (fs/read_write.c:1558) [ 161.316552][ T2255] ? f2fs_getattr (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) fs/f2fs/f2fs.h:3056 (discriminator 1) fs/f2fs/f2fs.h:3311 (discriminator 1) fs/f2fs/file.c:904 (discriminator 1)) f2fs [ 161.321759][ T2255] ? __pfx_vfs_copy_file_range (fs/read_write.c:1486) [ 161.327234][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) [ 161.332362][ T2255] __do_sys_copy_file_range (fs/read_write.c:1612) [ 161.337755][ T2255] ? __pfx___do_sys_copy_file_range (fs/read_write.c:1578) [ 161.343661][ T2255] ? f2fs_llseek (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) fs/f2fs/f2fs.h:3056 (discriminator 1) fs/f2fs/f2fs.h:3210 (discriminator 1) fs/f2fs/file.c:518 (discriminator 1)) f2fs [ 161.348813][ T2255] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) [ 161.353151][ T2255] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) [ 161.358886][ T2255] RIP: 0033:0x7f623f2a1719 [ 161.363137][ T2255] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 06 0d 00 f7 d8 64 89 01 48 All code ======== 0: 08 89 e8 5b 5d c3 or %cl,-0x3ca2a418(%rcx) 6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) d: 00 00 00 10: 90 nop 11: 48 89 f8 mov %rdi,%rax 14: 48 89 f7 mov %rsi,%rdi 17: 48 89 d6 mov %rdx,%rsi 1a: 48 89 ca mov %rcx,%rdx 1d: 4d 89 c2 mov %r8,%r10 20: 4d 89 c8 mov %r9,%r8 23: 4c 8b 4c 24 08 mov 0x8(%rsp),%r9 28: 0f 05 syscall 2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction 30: 73 01 jae 0x33 32: c3 retq 33: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06f1 3a: f7 d8 neg %eax 3c: 64 89 01 mov %eax,%fs:(%rcx) 3f: 48 rex.W Code starting with the faulting instruction =========================================== 0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax 6: 73 01 jae 0x9 8: c3 retq 9: 48 8b 0d b7 06 0d 00 mov 0xd06b7(%rip),%rcx # 0xd06c7 10: f7 d8 neg %eax 12: 64 89 01 mov %eax,%fs:(%rcx) 15: 48 rex.W The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20240517/202405171532.760e22d3-oliver.sang@intel.com
On 2024/5/15 16:32, Wu Bo wrote: > On Fri, May 10, 2024 at 10:39:06AM +0800, Chao Yu wrote: >> If lfs mode is on, buffered read may race w/ OPU dio write as below, >> it may cause buffered read hits unwritten data unexpectly, and for >> dio read, the race condition exists as well. >> >> Thread A Thread B >> - f2fs_file_write_iter >> - f2fs_dio_write_iter >> - __iomap_dio_rw >> - f2fs_iomap_begin >> - f2fs_map_blocks >> - __allocate_data_block >> - allocated blkaddr #x >> - iomap_dio_submit_bio >> - f2fs_file_read_iter >> - filemap_read >> - f2fs_read_data_folio >> - f2fs_mpage_readpages >> - f2fs_map_blocks >> : get blkaddr #x >> - f2fs_submit_read_bio >> IRQ >> - f2fs_read_end_io >> : read IO on blkaddr #x complete >> IRQ >> - iomap_dio_bio_end_io >> : direct write IO on blkaddr #x complete >> > Looks like every COW filesystem would meet this situation. What's the solution > of other FS? I missed to reply this... Other cow filesystem like btrfs, it will update metadata after data IO completion, so it is safe. Thanks, >> This patch introduces a new per-inode i_opu_rwsem lock to avoid >> such race condition. >>
On 2024/5/15 14:38, Chao Yu wrote: > On 2024/5/15 12:42, Jaegeuk Kim wrote: >> On 05/15, Chao Yu wrote: >>> On 2024/5/15 0:09, Jaegeuk Kim wrote: >>>> On 05/10, Chao Yu wrote: >>>>> If lfs mode is on, buffered read may race w/ OPU dio write as below, >>>>> it may cause buffered read hits unwritten data unexpectly, and for >>>>> dio read, the race condition exists as well. >>>>> >>>>> Thread A Thread B >>>>> - f2fs_file_write_iter >>>>> - f2fs_dio_write_iter >>>>> - __iomap_dio_rw >>>>> - f2fs_iomap_begin >>>>> - f2fs_map_blocks >>>>> - __allocate_data_block >>>>> - allocated blkaddr #x >>>>> - iomap_dio_submit_bio >>>>> - f2fs_file_read_iter >>>>> - filemap_read >>>>> - f2fs_read_data_folio >>>>> - f2fs_mpage_readpages >>>>> - f2fs_map_blocks >>>>> : get blkaddr #x >>>>> - f2fs_submit_read_bio >>>>> IRQ >>>>> - f2fs_read_end_io >>>>> : read IO on blkaddr #x complete >>>>> IRQ >>>>> - iomap_dio_bio_end_io >>>>> : direct write IO on blkaddr #x complete >>>>> >>>>> This patch introduces a new per-inode i_opu_rwsem lock to avoid >>>>> such race condition. >>>> >>>> Wasn't this supposed to be managed by user-land? >>> >>> Actually, the test case is: >>> >>> 1. mount w/ lfs mode >>> 2. touch file; >>> 3. initialize file w/ 4k zeroed data; fsync; >>> 4. continue triggering dio write 4k zeroed data to file; >>> 5. and meanwhile, continue triggering buf/dio 4k read in file, >>> use md5sum to verify the 4k data; >>> >>> It expects data is all zero, however it turned out it's not. >> >> Can we check outstanding write bios instead of abusing locks? Jaegeuk, seems it can solve partial race cases, not all of them. Do you suggest to use this compromised solution? Thanks, > > I didn't figure out a way to solve this w/o lock, due to: > - write bios can be issued after outstanding write bios check condition, > result in the race. > - once read() detects that there are outstanding write bios, we need to > delay read flow rather than fail it, right? It looks using a lock is more > proper here? > > Any suggestion? > > Thanks, > >> >>> >>> Thanks, >>> >>>> >>>>> >>>>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") >>>>> Signed-off-by: Chao Yu <chao@kernel.org> >>>>> --- >>>>> v2: >>>>> - fix to cover dio read path w/ i_opu_rwsem as well. >>>>> fs/f2fs/f2fs.h | 1 + >>>>> fs/f2fs/file.c | 28 ++++++++++++++++++++++++++-- >>>>> fs/f2fs/super.c | 1 + >>>>> 3 files changed, 28 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h >>>>> index 30058e16a5d0..91cf4b3d6bc6 100644 >>>>> --- a/fs/f2fs/f2fs.h >>>>> +++ b/fs/f2fs/f2fs.h >>>>> @@ -847,6 +847,7 @@ struct f2fs_inode_info { >>>>> /* avoid racing between foreground op and gc */ >>>>> struct f2fs_rwsem i_gc_rwsem[2]; >>>>> struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ >>>>> + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */ >>>>> >>>>> int i_extra_isize; /* size of extra space located in i_addr */ >>>>> kprojid_t i_projid; /* id for project quota */ >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c >>>>> index 72ce1a522fb2..4ec260af321f 100644 >>>>> --- a/fs/f2fs/file.c >>>>> +++ b/fs/f2fs/file.c >>>>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>>> const loff_t pos = iocb->ki_pos; >>>>> const size_t count = iov_iter_count(to); >>>>> struct iomap_dio *dio; >>>>> + bool do_opu = f2fs_lfs_mode(sbi); >>>>> ssize_t ret; >>>>> >>>>> if (count == 0) >>>>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>>> ret = -EAGAIN; >>>>> goto out; >>>>> } >>>>> + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) { >>>>> + f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>>> + ret = -EAGAIN; >>>>> + goto out; >>>>> + } >>>>> } else { >>>>> f2fs_down_read(&fi->i_gc_rwsem[READ]); >>>>> + f2fs_down_read(&fi->i_opu_rwsem); >>>>> } >>>>> >>>>> /* >>>>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>>> ret = iomap_dio_complete(dio); >>>>> } >>>>> >>>>> + f2fs_up_read(&fi->i_opu_rwsem); >>>>> f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>>> >>>>> file_accessed(file); >>>>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) >>>>> if (f2fs_should_use_dio(inode, iocb, to)) { >>>>> ret = f2fs_dio_read_iter(iocb, to); >>>>> } else { >>>>> + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode)); >>>>> + >>>>> + if (do_opu) >>>>> + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem); >>>>> ret = filemap_read(iocb, to, 0); >>>>> + if (do_opu) >>>>> + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem); >>>>> if (ret > 0) >>>>> f2fs_update_iostat(F2FS_I_SB(inode), inode, >>>>> APP_BUFFERED_READ_IO, ret); >>>>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, >>>>> ret = -EAGAIN; >>>>> goto out; >>>>> } >>>>> + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) { >>>>> + f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>>> + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); >>>>> + ret = -EAGAIN; >>>>> + goto out; >>>>> + } >>>>> } else { >>>>> ret = f2fs_convert_inline_inode(inode); >>>>> if (ret) >>>>> goto out; >>>>> >>>>> f2fs_down_read(&fi->i_gc_rwsem[WRITE]); >>>>> - if (do_opu) >>>>> + if (do_opu) { >>>>> f2fs_down_read(&fi->i_gc_rwsem[READ]); >>>>> + f2fs_down_write(&fi->i_opu_rwsem); >>>>> + } >>>>> } >>>>> >>>>> /* >>>>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, >>>>> ret = iomap_dio_complete(dio); >>>>> } >>>>> >>>>> - if (do_opu) >>>>> + if (do_opu) { >>>>> + f2fs_up_write(&fi->i_opu_rwsem); >>>>> f2fs_up_read(&fi->i_gc_rwsem[READ]); >>>>> + } >>>>> f2fs_up_read(&fi->i_gc_rwsem[WRITE]); >>>>> >>>>> if (ret < 0) >>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c >>>>> index daf2c4dbe150..b4ed3b094366 100644 >>>>> --- a/fs/f2fs/super.c >>>>> +++ b/fs/f2fs/super.c >>>>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) >>>>> init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); >>>>> init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); >>>>> init_f2fs_rwsem(&fi->i_xattr_sem); >>>>> + init_f2fs_rwsem(&fi->i_opu_rwsem); >>>>> >>>>> /* Will be used by directory only */ >>>>> fi->i_dir_level = F2FS_SB(sb)->dir_level; >>>>> -- >>>>> 2.40.1 > > > _______________________________________________ > Linux-f2fs-devel mailing list > Linux-f2fs-devel@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 30058e16a5d0..91cf4b3d6bc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -847,6 +847,7 @@ struct f2fs_inode_info { /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ + struct f2fs_rwsem i_opu_rwsem; /* avoid racing between buf read and opu dio write */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 72ce1a522fb2..4ec260af321f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) const loff_t pos = iocb->ki_pos; const size_t count = iov_iter_count(to); struct iomap_dio *dio; + bool do_opu = f2fs_lfs_mode(sbi); ssize_t ret; if (count == 0) @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; goto out; } + if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) { + f2fs_up_read(&fi->i_gc_rwsem[READ]); + ret = -EAGAIN; + goto out; + } } else { f2fs_down_read(&fi->i_gc_rwsem[READ]); + f2fs_down_read(&fi->i_opu_rwsem); } /* @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) ret = iomap_dio_complete(dio); } + f2fs_up_read(&fi->i_opu_rwsem); f2fs_up_read(&fi->i_gc_rwsem[READ]); file_accessed(file); @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (f2fs_should_use_dio(inode, iocb, to)) { ret = f2fs_dio_read_iter(iocb, to); } else { + bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode)); + + if (do_opu) + f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem); ret = filemap_read(iocb, to, 0); + if (do_opu) + f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem); if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_READ_IO, ret); @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret = -EAGAIN; goto out; } + if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) { + f2fs_up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } } else { ret = f2fs_convert_inline_inode(inode); if (ret) goto out; f2fs_down_read(&fi->i_gc_rwsem[WRITE]); - if (do_opu) + if (do_opu) { f2fs_down_read(&fi->i_gc_rwsem[READ]); + f2fs_down_write(&fi->i_opu_rwsem); + } } /* @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret = iomap_dio_complete(dio); } - if (do_opu) + if (do_opu) { + f2fs_up_write(&fi->i_opu_rwsem); f2fs_up_read(&fi->i_gc_rwsem[READ]); + } f2fs_up_read(&fi->i_gc_rwsem[WRITE]); if (ret < 0) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index daf2c4dbe150..b4ed3b094366 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); + init_f2fs_rwsem(&fi->i_opu_rwsem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level;
If lfs mode is on, buffered read may race w/ OPU dio write as below, it may cause buffered read hits unwritten data unexpectly, and for dio read, the race condition exists as well. Thread A Thread B - f2fs_file_write_iter - f2fs_dio_write_iter - __iomap_dio_rw - f2fs_iomap_begin - f2fs_map_blocks - __allocate_data_block - allocated blkaddr #x - iomap_dio_submit_bio - f2fs_file_read_iter - filemap_read - f2fs_read_data_folio - f2fs_mpage_readpages - f2fs_map_blocks : get blkaddr #x - f2fs_submit_read_bio IRQ - f2fs_read_end_io : read IO on blkaddr #x complete IRQ - iomap_dio_bio_end_io : direct write IO on blkaddr #x complete This patch introduces a new per-inode i_opu_rwsem lock to avoid such race condition. Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode") Signed-off-by: Chao Yu <chao@kernel.org> --- v2: - fix to cover dio read path w/ i_opu_rwsem as well. fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 28 ++++++++++++++++++++++++++-- fs/f2fs/super.c | 1 + 3 files changed, 28 insertions(+), 2 deletions(-)