Message ID | 20221101003412.3842572-7-david@fromorbit.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/7] xfs: write page faults in iomap are not buffered writes | expand |
Hi Dave, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on xfs-linux/for-next] [also build test WARNING on linus/master v6.1-rc3 next-20221101] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Dave-Chinner/xfs-write-page-faults-in-iomap-are-not-buffered-writes/20221101-104935 base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next patch link: https://lore.kernel.org/r/20221101003412.3842572-7-david%40fromorbit.com patch subject: [PATCH 6/7] xfs: use iomap_valid method to detect stale cached iomaps config: riscv-randconfig-r042-20221101 compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 791a7ae1ba3efd6bca96338e10ffde557ba83920) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # install riscv cross compiling tool for clang build # apt-get install binutils-riscv64-linux-gnu # https://github.com/intel-lab-lkp/linux/commit/4a183febef8e4f12f2a256fb67bc3021122b662f git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Dave-Chinner/xfs-write-page-faults-in-iomap-are-not-buffered-writes/20221101-104935 git checkout 4a183febef8e4f12f2a256fb67bc3021122b662f # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash fs/xfs/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> All warnings (new ones prefixed by >>): >> fs/xfs/xfs_pnfs.c:183:6: warning: variable 'seq' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] if (!error && write && 
^~~~~~~~~~~~~~~~~~ fs/xfs/xfs_pnfs.c:213:52: note: uninitialized use occurs here error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); ^~~ fs/xfs/xfs_pnfs.c:183:2: note: remove the 'if' if its condition is always true if (!error && write && ^~~~~~~~~~~~~~~~~~~~~~ >> fs/xfs/xfs_pnfs.c:183:6: warning: variable 'seq' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized] if (!error && write && ^~~~~~~~~~~~~~~ fs/xfs/xfs_pnfs.c:213:52: note: uninitialized use occurs here error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); ^~~ fs/xfs/xfs_pnfs.c:183:6: note: remove the '&&' if its condition is always true if (!error && write && ^~~~~~~~~~~~~~~~~~ >> fs/xfs/xfs_pnfs.c:183:6: warning: variable 'seq' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized] if (!error && write && ^~~~~~ fs/xfs/xfs_pnfs.c:213:52: note: uninitialized use occurs here error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); ^~~ fs/xfs/xfs_pnfs.c:183:6: note: remove the '&&' if its condition is always true if (!error && write && ^~~~~~~~~ fs/xfs/xfs_pnfs.c:128:11: note: initialize the variable 'seq' to silence this warning int seq; ^ = 0 3 warnings generated. vim +183 fs/xfs/xfs_pnfs.c b39a04636fd7454 Dave Chinner 2022-01-31 106 527851124d10f9c Christoph Hellwig 2015-02-16 107 /* 527851124d10f9c Christoph Hellwig 2015-02-16 108 * Get a layout for the pNFS client. 
527851124d10f9c Christoph Hellwig 2015-02-16 109 */ 527851124d10f9c Christoph Hellwig 2015-02-16 110 int 527851124d10f9c Christoph Hellwig 2015-02-16 111 xfs_fs_map_blocks( 527851124d10f9c Christoph Hellwig 2015-02-16 112 struct inode *inode, 527851124d10f9c Christoph Hellwig 2015-02-16 113 loff_t offset, 527851124d10f9c Christoph Hellwig 2015-02-16 114 u64 length, 527851124d10f9c Christoph Hellwig 2015-02-16 115 struct iomap *iomap, 527851124d10f9c Christoph Hellwig 2015-02-16 116 bool write, 527851124d10f9c Christoph Hellwig 2015-02-16 117 u32 *device_generation) 527851124d10f9c Christoph Hellwig 2015-02-16 118 { 527851124d10f9c Christoph Hellwig 2015-02-16 119 struct xfs_inode *ip = XFS_I(inode); 527851124d10f9c Christoph Hellwig 2015-02-16 120 struct xfs_mount *mp = ip->i_mount; 527851124d10f9c Christoph Hellwig 2015-02-16 121 struct xfs_bmbt_irec imap; 527851124d10f9c Christoph Hellwig 2015-02-16 122 xfs_fileoff_t offset_fsb, end_fsb; 527851124d10f9c Christoph Hellwig 2015-02-16 123 loff_t limit; 527851124d10f9c Christoph Hellwig 2015-02-16 124 int bmapi_flags = XFS_BMAPI_ENTIRE; 527851124d10f9c Christoph Hellwig 2015-02-16 125 int nimaps = 1; 527851124d10f9c Christoph Hellwig 2015-02-16 126 uint lock_flags; 527851124d10f9c Christoph Hellwig 2015-02-16 127 int error = 0; 4a183febef8e4f1 Dave Chinner 2022-11-01 128 int seq; 527851124d10f9c Christoph Hellwig 2015-02-16 129 75c8c50fa16a23f Dave Chinner 2021-08-18 130 if (xfs_is_shutdown(mp)) 527851124d10f9c Christoph Hellwig 2015-02-16 131 return -EIO; 527851124d10f9c Christoph Hellwig 2015-02-16 132 527851124d10f9c Christoph Hellwig 2015-02-16 133 /* 527851124d10f9c Christoph Hellwig 2015-02-16 134 * We can't export inodes residing on the realtime device. The realtime 527851124d10f9c Christoph Hellwig 2015-02-16 135 * device doesn't have a UUID to identify it, so the client has no way 527851124d10f9c Christoph Hellwig 2015-02-16 136 * to find it. 
527851124d10f9c Christoph Hellwig 2015-02-16 137 */ 527851124d10f9c Christoph Hellwig 2015-02-16 138 if (XFS_IS_REALTIME_INODE(ip)) 527851124d10f9c Christoph Hellwig 2015-02-16 139 return -ENXIO; 527851124d10f9c Christoph Hellwig 2015-02-16 140 46eeb521b952471 Darrick J. Wong 2016-10-03 141 /* 46eeb521b952471 Darrick J. Wong 2016-10-03 142 * The pNFS block layout spec actually supports reflink like 46eeb521b952471 Darrick J. Wong 2016-10-03 143 * functionality, but the Linux pNFS server doesn't implement it yet. 46eeb521b952471 Darrick J. Wong 2016-10-03 144 */ 46eeb521b952471 Darrick J. Wong 2016-10-03 145 if (xfs_is_reflink_inode(ip)) 46eeb521b952471 Darrick J. Wong 2016-10-03 146 return -ENXIO; 46eeb521b952471 Darrick J. Wong 2016-10-03 147 527851124d10f9c Christoph Hellwig 2015-02-16 148 /* 527851124d10f9c Christoph Hellwig 2015-02-16 149 * Lock out any other I/O before we flush and invalidate the pagecache, 527851124d10f9c Christoph Hellwig 2015-02-16 150 * and then hand out a layout to the remote system. This is very 527851124d10f9c Christoph Hellwig 2015-02-16 151 * similar to direct I/O, except that the synchronization is much more 69eb5fa10eb283e Dan Williams 2018-03-20 152 * complicated. See the comment near xfs_break_leased_layouts 69eb5fa10eb283e Dan Williams 2018-03-20 153 * for a detailed explanation. 
527851124d10f9c Christoph Hellwig 2015-02-16 154 */ 527851124d10f9c Christoph Hellwig 2015-02-16 155 xfs_ilock(ip, XFS_IOLOCK_EXCL); 527851124d10f9c Christoph Hellwig 2015-02-16 156 527851124d10f9c Christoph Hellwig 2015-02-16 157 error = -EINVAL; 527851124d10f9c Christoph Hellwig 2015-02-16 158 limit = mp->m_super->s_maxbytes; 527851124d10f9c Christoph Hellwig 2015-02-16 159 if (!write) 527851124d10f9c Christoph Hellwig 2015-02-16 160 limit = max(limit, round_up(i_size_read(inode), 527851124d10f9c Christoph Hellwig 2015-02-16 161 inode->i_sb->s_blocksize)); 527851124d10f9c Christoph Hellwig 2015-02-16 162 if (offset > limit) 527851124d10f9c Christoph Hellwig 2015-02-16 163 goto out_unlock; 527851124d10f9c Christoph Hellwig 2015-02-16 164 if (offset > limit - length) 527851124d10f9c Christoph Hellwig 2015-02-16 165 length = limit - offset; 527851124d10f9c Christoph Hellwig 2015-02-16 166 527851124d10f9c Christoph Hellwig 2015-02-16 167 error = filemap_write_and_wait(inode->i_mapping); 527851124d10f9c Christoph Hellwig 2015-02-16 168 if (error) 527851124d10f9c Christoph Hellwig 2015-02-16 169 goto out_unlock; 527851124d10f9c Christoph Hellwig 2015-02-16 170 error = invalidate_inode_pages2(inode->i_mapping); 527851124d10f9c Christoph Hellwig 2015-02-16 171 if (WARN_ON_ONCE(error)) 2bd3fa793aaa7e9 Christoph Hellwig 2020-11-11 172 goto out_unlock; 527851124d10f9c Christoph Hellwig 2015-02-16 173 527851124d10f9c Christoph Hellwig 2015-02-16 174 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); 527851124d10f9c Christoph Hellwig 2015-02-16 175 offset_fsb = XFS_B_TO_FSBT(mp, offset); 527851124d10f9c Christoph Hellwig 2015-02-16 176 527851124d10f9c Christoph Hellwig 2015-02-16 177 lock_flags = xfs_ilock_data_map_shared(ip); 527851124d10f9c Christoph Hellwig 2015-02-16 178 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 527851124d10f9c Christoph Hellwig 2015-02-16 179 &imap, &nimaps, bmapi_flags); 527851124d10f9c Christoph Hellwig 2015-02-16 180 
88cdb7147b21b2d Christoph Hellwig 2019-10-30 181 ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); 88cdb7147b21b2d Christoph Hellwig 2019-10-30 182 e696663a97e89f8 Christoph Hellwig 2019-10-30 @183 if (!error && write && e696663a97e89f8 Christoph Hellwig 2019-10-30 184 (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) { e696663a97e89f8 Christoph Hellwig 2019-10-30 185 if (offset + length > XFS_ISIZE(ip)) e696663a97e89f8 Christoph Hellwig 2019-10-30 186 end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); e696663a97e89f8 Christoph Hellwig 2019-10-30 187 else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) e696663a97e89f8 Christoph Hellwig 2019-10-30 188 end_fsb = min(end_fsb, imap.br_startoff + e696663a97e89f8 Christoph Hellwig 2019-10-30 189 imap.br_blockcount); e696663a97e89f8 Christoph Hellwig 2019-10-30 190 xfs_iunlock(ip, lock_flags); e696663a97e89f8 Christoph Hellwig 2019-10-30 191 e696663a97e89f8 Christoph Hellwig 2019-10-30 192 error = xfs_iomap_write_direct(ip, offset_fsb, 4a183febef8e4f1 Dave Chinner 2022-11-01 193 end_fsb - offset_fsb, 0, &imap, &seq); 527851124d10f9c Christoph Hellwig 2015-02-16 194 if (error) 527851124d10f9c Christoph Hellwig 2015-02-16 195 goto out_unlock; 527851124d10f9c Christoph Hellwig 2015-02-16 196 527851124d10f9c Christoph Hellwig 2015-02-16 197 /* 307cdb54b80e036 Christoph Hellwig 2019-10-30 198 * Ensure the next transaction is committed synchronously so 307cdb54b80e036 Christoph Hellwig 2019-10-30 199 * that the blocks allocated and handed out to the client are 307cdb54b80e036 Christoph Hellwig 2019-10-30 200 * guaranteed to be present even after a server crash. 
527851124d10f9c Christoph Hellwig 2015-02-16 201 */ b39a04636fd7454 Dave Chinner 2022-01-31 202 error = xfs_fs_map_update_inode(ip); 472c6e46f589c26 Dave Chinner 2022-01-31 203 if (!error) 472c6e46f589c26 Dave Chinner 2022-01-31 204 error = xfs_log_force_inode(ip); 527851124d10f9c Christoph Hellwig 2015-02-16 205 if (error) 527851124d10f9c Christoph Hellwig 2015-02-16 206 goto out_unlock; 472c6e46f589c26 Dave Chinner 2022-01-31 207 e696663a97e89f8 Christoph Hellwig 2019-10-30 208 } else { e696663a97e89f8 Christoph Hellwig 2019-10-30 209 xfs_iunlock(ip, lock_flags); 527851124d10f9c Christoph Hellwig 2015-02-16 210 } 527851124d10f9c Christoph Hellwig 2015-02-16 211 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 527851124d10f9c Christoph Hellwig 2015-02-16 212 4a183febef8e4f1 Dave Chinner 2022-11-01 213 error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); 527851124d10f9c Christoph Hellwig 2015-02-16 214 *device_generation = mp->m_generation; 527851124d10f9c Christoph Hellwig 2015-02-16 215 return error; 527851124d10f9c Christoph Hellwig 2015-02-16 216 out_unlock: 527851124d10f9c Christoph Hellwig 2015-02-16 217 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 527851124d10f9c Christoph Hellwig 2015-02-16 218 return error; 527851124d10f9c Christoph Hellwig 2015-02-16 219 } 527851124d10f9c Christoph Hellwig 2015-02-16 220
> + *((int *)&iomap->private) = sequence; > +static bool > +xfs_buffered_write_iomap_valid( > + struct inode *inode, > + const struct iomap *iomap) > +{ > + int seq = *((int *)&iomap->private); I really hate this stuffing of the sequence into the private pointer. The iomap structure isn't so size constrained that we have to do that, so we can just add a sequence number field directly to it. I don't think that is a layering violation, as the concept of a sequence number is pretty generic and we'll probably need it for all file systems eventually. > + > + if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq)) > + return false; Which makes me wonder if we could do away with the callback entirely by adding an optional sequence number pointer to the iomap_iter. If set the core code compares it against iomap->seq and we get rid of the per-folio indirect call, and boilerplate code that would need to be implemented in every file system.
On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > Now that iomap supports a mechanism to validate cached iomaps for > buffered write operations, hook it up to the XFS buffered write ops > so that we can avoid data corruptions that result from stale cached > iomaps. See: > > https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ > > or the ->iomap_valid() introduction commit for exact details of the > corruption vector. > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > --- > fs/xfs/libxfs/xfs_bmap.c | 4 +-- > fs/xfs/xfs_aops.c | 2 +- > fs/xfs/xfs_iomap.c | 69 ++++++++++++++++++++++++++++++---------- > fs/xfs/xfs_iomap.h | 4 +-- > fs/xfs/xfs_pnfs.c | 5 +-- > 5 files changed, 61 insertions(+), 23 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c > index 49d0d4ea63fc..db225130618c 100644 > --- a/fs/xfs/libxfs/xfs_bmap.c > +++ b/fs/xfs/libxfs/xfs_bmap.c > @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc( > * the extent. Just return the real extent at this offset. > */ > if (!isnullstartblock(bma.got.br_startblock)) { > - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); > *seq = READ_ONCE(ifp->if_seq); > + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); > goto out_trans_cancel; > } > > @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc( > XFS_STATS_INC(mp, xs_xstrat_quick); > > ASSERT(!isnullstartblock(bma.got.br_startblock)); > - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); > *seq = READ_ONCE(ifp->if_seq); > + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); > > if (whichfork == XFS_COW_FORK) Hmm. @ifp here could be the cow fork, which means *seq will be from the cow fork. This I think is going to cause problems in xfs_buffered_write_iomap_valid... 
> xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 5d1a995b15f8..ca5a9e45a48c 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -373,7 +373,7 @@ xfs_map_blocks( > isnullstartblock(imap.br_startblock)) > goto allocate_blocks; > > - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); > + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); > trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); > return 0; > allocate_blocks: > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > index 2d48fcc7bd6f..5053ffcf10fe 100644 > --- a/fs/xfs/xfs_iomap.c > +++ b/fs/xfs/xfs_iomap.c > @@ -54,7 +54,8 @@ xfs_bmbt_to_iomap( > struct iomap *iomap, > struct xfs_bmbt_irec *imap, > unsigned int mapping_flags, > - u16 iomap_flags) > + u16 iomap_flags, > + int sequence) Nit: The sequence numbers are unsigned int, not signed int. > { > struct xfs_mount *mp = ip->i_mount; > struct xfs_buftarg *target = xfs_inode_buftarg(ip); > @@ -91,6 +92,9 @@ xfs_bmbt_to_iomap( > if (xfs_ipincount(ip) && > (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) > iomap->flags |= IOMAP_F_DIRTY; > + > + /* The extent tree sequence is needed for iomap validity checking. 
*/ > + *((int *)&iomap->private) = sequence; > return 0; > } > > @@ -195,7 +199,8 @@ xfs_iomap_write_direct( > xfs_fileoff_t offset_fsb, > xfs_fileoff_t count_fsb, > unsigned int flags, > - struct xfs_bmbt_irec *imap) > + struct xfs_bmbt_irec *imap, > + int *seq) > { > struct xfs_mount *mp = ip->i_mount; > struct xfs_trans *tp; > @@ -285,6 +290,8 @@ xfs_iomap_write_direct( > error = xfs_alert_fsblock_zero(ip, imap); > > out_unlock: > + if (seq) > + *seq = READ_ONCE(ip->i_df.if_seq); > xfs_iunlock(ip, XFS_ILOCK_EXCL); > return error; > > @@ -743,6 +750,7 @@ xfs_direct_write_iomap_begin( > bool shared = false; > u16 iomap_flags = 0; > unsigned int lockmode = XFS_ILOCK_SHARED; > + int seq; > > ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); > > @@ -811,9 +819,10 @@ xfs_direct_write_iomap_begin( > goto out_unlock; > } > > + seq = READ_ONCE(ip->i_df.if_seq); > xfs_iunlock(ip, lockmode); > trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); > - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); > + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); > > allocate_blocks: > error = -EAGAIN; > @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin( > xfs_iunlock(ip, lockmode); > > error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, > - flags, &imap); > + flags, &imap, &seq); > if (error) > return error; > > trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); > return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, > - iomap_flags | IOMAP_F_NEW); > + iomap_flags | IOMAP_F_NEW, seq); > > out_found_cow: > + seq = READ_ONCE(ip->i_df.if_seq); > xfs_iunlock(ip, lockmode); > length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); > trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); > if (imap.br_startblock != HOLESTARTBLOCK) { > - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); > + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); > if (error) > return error; > } > - 
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); > + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); Here we've found a mapping from the COW fork and we're encoding it into the struct iomap. Why is the sequence number from the *data* fork and not the COW fork? > > out_unlock: > if (lockmode) > @@ -915,6 +925,7 @@ xfs_buffered_write_iomap_begin( > int allocfork = XFS_DATA_FORK; > int error = 0; > unsigned int lockmode = XFS_ILOCK_EXCL; > + int seq; > > if (xfs_is_shutdown(mp)) > return -EIO; > @@ -1094,26 +1105,29 @@ xfs_buffered_write_iomap_begin( > * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch > * them out if the write happens to fail. > */ > + seq = READ_ONCE(ip->i_df.if_seq); > xfs_iunlock(ip, XFS_ILOCK_EXCL); > trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); > - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); > + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); > > found_imap: > + seq = READ_ONCE(ip->i_df.if_seq); > xfs_iunlock(ip, XFS_ILOCK_EXCL); > - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); > + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); > > found_cow: > + seq = READ_ONCE(ip->i_df.if_seq); Same question here. 
> xfs_iunlock(ip, XFS_ILOCK_EXCL); > if (imap.br_startoff <= offset_fsb) { > - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); > + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); > if (error) > return error; > return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, > - IOMAP_F_SHARED); > + IOMAP_F_SHARED, seq); > } > > xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); > - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); > + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); > > out_unlock: > xfs_iunlock(ip, XFS_ILOCK_EXCL); > @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end( > return 0; > } > > +/* > + * Check that the iomap passed to us is still valid for the given offset and > + * length. > + */ > +static bool > +xfs_buffered_write_iomap_valid( > + struct inode *inode, > + const struct iomap *iomap) > +{ > + int seq = *((int *)&iomap->private); > + > + if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq)) Why is it ok to sample the data fork's sequence number even if the mapping came from the COW fork? That doesn't make sense to me, conceptually. I definitely think it's buggy that the revalidation might not sample from the same sequence counter as the original measurement. My curiosity is now increased about why shouldn't we return the u32 counter and u32 pointer in the struct iomap so that we can keep the sample/revalidation straight in XFS, and iomap can avoid a second indirect call? 
> + return false; > + return true; > +} > + > const struct iomap_ops xfs_buffered_write_iomap_ops = { > .iomap_begin = xfs_buffered_write_iomap_begin, > .iomap_end = xfs_buffered_write_iomap_end, > + .iomap_valid = xfs_buffered_write_iomap_valid, > }; > > /* > @@ -1359,6 +1390,7 @@ xfs_read_iomap_begin( > int nimaps = 1, error = 0; > bool shared = false; > unsigned int lockmode = XFS_ILOCK_SHARED; > + int seq; > > ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); > > @@ -1372,13 +1404,14 @@ xfs_read_iomap_begin( > &nimaps, 0); > if (!error && (flags & IOMAP_REPORT)) > error = xfs_reflink_trim_around_shared(ip, &imap, &shared); > + seq = READ_ONCE(ip->i_df.if_seq); > xfs_iunlock(ip, lockmode); > > if (error) > return error; > trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); > return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, > - shared ? IOMAP_F_SHARED : 0); > + shared ? IOMAP_F_SHARED : 0, seq); > } > > const struct iomap_ops xfs_read_iomap_ops = { > @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin( > end_fsb = min(end_fsb, data_fsb); > xfs_trim_extent(&cmap, offset_fsb, end_fsb); > error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, > - IOMAP_F_SHARED); > + IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq)); Here we explicitly sample from the cow fork but the comparison is done against the data fork. That /probably/ doesn't matter for reads since you're not proposing that we revalidate on those. Should we? What happens if userspace hands us a large read() from an unwritten extent and the same dirty/writeback/reclaim sequence happens to the last folio in that read() range before read_iter actually gets there? 
> /* > * This is a COW extent, so we must probe the page cache > * because there could be dirty page cache being backed > @@ -1460,7 +1493,8 @@ xfs_seek_iomap_begin( > imap.br_state = XFS_EXT_NORM; > done: > xfs_trim_extent(&imap, offset_fsb, end_fsb); > - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); > + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, > + READ_ONCE(ip->i_df.if_seq)); > out_unlock: > xfs_iunlock(ip, lockmode); > return error; > @@ -1486,6 +1520,7 @@ xfs_xattr_iomap_begin( > struct xfs_bmbt_irec imap; > int nimaps = 1, error = 0; > unsigned lockmode; > + int seq; > > if (xfs_is_shutdown(mp)) > return -EIO; > @@ -1502,12 +1537,14 @@ xfs_xattr_iomap_begin( > error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, > &nimaps, XFS_BMAPI_ATTRFORK); > out_unlock: > + > + seq = READ_ONCE(ip->i_af.if_seq); > xfs_iunlock(ip, lockmode); > > if (error) > return error; > ASSERT(nimaps); > - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); > + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); > } > > const struct iomap_ops xfs_xattr_iomap_ops = { > diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h > index 0f62ab633040..792fed2a9072 100644 > --- a/fs/xfs/xfs_iomap.h > +++ b/fs/xfs/xfs_iomap.h > @@ -13,14 +13,14 @@ struct xfs_bmbt_irec; > > int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, > xfs_fileoff_t count_fsb, unsigned int flags, > - struct xfs_bmbt_irec *imap); > + struct xfs_bmbt_irec *imap, int *sequence); > int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); > xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, > xfs_fileoff_t end_fsb); > > int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, > struct xfs_bmbt_irec *imap, unsigned int mapping_flags, > - u16 iomap_flags); > + u16 iomap_flags, int sequence); > > int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, > bool *did_zero); > diff --git a/fs/xfs/xfs_pnfs.c 
b/fs/xfs/xfs_pnfs.c > index 37a24f0f7cd4..eea507a80c5c 100644 > --- a/fs/xfs/xfs_pnfs.c > +++ b/fs/xfs/xfs_pnfs.c > @@ -125,6 +125,7 @@ xfs_fs_map_blocks( > int nimaps = 1; > uint lock_flags; > int error = 0; > + int seq; > > if (xfs_is_shutdown(mp)) > return -EIO; > @@ -189,7 +190,7 @@ xfs_fs_map_blocks( > xfs_iunlock(ip, lock_flags); > > error = xfs_iomap_write_direct(ip, offset_fsb, > - end_fsb - offset_fsb, 0, &imap); > + end_fsb - offset_fsb, 0, &imap, &seq); I got a compiler warning about seq not being set in the case where we don't call xfs_iomap_write_direct. --D > if (error) > goto out_unlock; > > @@ -209,7 +210,7 @@ xfs_fs_map_blocks( > } > xfs_iunlock(ip, XFS_IOLOCK_EXCL); > > - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); > + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); > *device_generation = mp->m_generation; > return error; > out_unlock: > -- > 2.37.2 >
On Wed, Nov 02, 2022 at 01:41:25AM -0700, Christoph Hellwig wrote: > > + *((int *)&iomap->private) = sequence; > > > +static bool > > +xfs_buffered_write_iomap_valid( > > + struct inode *inode, > > + const struct iomap *iomap) > > +{ > > + int seq = *((int *)&iomap->private); > > I really hate this stuffing of the sequence into the private pointer. Oh, I'm no fan of it either. It was this or work out how to support sequence numbers/cookies directly in iomaps... > The iomap structure isn't so size constrained that we have to do that, > so we can just add a sequence number field directly to it. I don't > think that is a layering violation, as the concept of a sequence > numebr is pretty generic and we'll probably need it for all file systems > eventually. *nod* This was the least of my worries trying to get this code to work. I didn't have to think about it this way, so it was one less thing to worry about. My concerns with putting it into the iomap is that different filesystems will have different mechanisms for detecting stale iomaps. The way we do it with a generation counter is pretty coarse as any change to the extent map will invalidate the iomap regardless of whether they overlap or not. If, in future, we want something more complex and finer grained (e.g. an iext tree cursor) to allow us to determine if the change to the extent tree actually modified the extent backing the iomap, then we are going to need an opaque cookie of some kind, not a u32 or a u32*. > > + > > + if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq)) > > + return false; > > Which makes me wonder if we could do away with the callback entirely > by adding an option sequence number pointer to the iomap_iter. If set > the core code compares it against iomap->seq and we get rid of the > per-folio indirect call, and boilerplate code that would need to be > implemented in every file system. I'm not convinced that this is the right way to proceed.
The writeback code also has cached iomap validity checks via a callback. The checks in xfs_imap_valid() require more than just a single sequence number check - there are two sequence numbers, one of which is conditionally checked when the inode may have COW operations occurring. Hence I don't think that encoding a single u32 into the iomap is generic enough even for the current "check the cached iomap is not stale" use cases we have. I'd really like to move towards a common mechanism for both the write and writeback paths. I didn't have the brain capacity to think all this through at the time, so I just stuffed the sequence number in the private field and moved on to the next part of the problem. Indeed, I haven't even checked if the read path might be susceptible to stale cached iomaps yet.... Cheers, Dave.
On Wed, Nov 02, 2022 at 10:19:33AM -0700, Darrick J. Wong wrote: > On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote: > > From: Dave Chinner <dchinner@redhat.com> > > > > Now that iomap supports a mechanism to validate cached iomaps for > > buffered write operations, hook it up to the XFS buffered write ops > > so that we can avoid data corruptions that result from stale cached > > iomaps. See: > > > > https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ > > > > or the ->iomap_valid() introduction commit for exact details of the > > corruption vector. > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > --- > > fs/xfs/libxfs/xfs_bmap.c | 4 +-- > > fs/xfs/xfs_aops.c | 2 +- > > fs/xfs/xfs_iomap.c | 69 ++++++++++++++++++++++++++++++---------- > > fs/xfs/xfs_iomap.h | 4 +-- > > fs/xfs/xfs_pnfs.c | 5 +-- > > 5 files changed, 61 insertions(+), 23 deletions(-) > > > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c > > index 49d0d4ea63fc..db225130618c 100644 > > --- a/fs/xfs/libxfs/xfs_bmap.c > > +++ b/fs/xfs/libxfs/xfs_bmap.c > > @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc( > > * the extent. Just return the real extent at this offset. > > */ > > if (!isnullstartblock(bma.got.br_startblock)) { > > - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); > > *seq = READ_ONCE(ifp->if_seq); > > + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); > > goto out_trans_cancel; > > } > > > > @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc( > > XFS_STATS_INC(mp, xs_xstrat_quick); > > > > ASSERT(!isnullstartblock(bma.got.br_startblock)); > > - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); > > *seq = READ_ONCE(ifp->if_seq); > > + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); > > > > if (whichfork == XFS_COW_FORK) > > Hmm. @ifp here could be the cow fork, which means *seq will be from the > cow fork. This I think is going to cause problems in > xfs_buffered_write_iomap_valid... 
We can't get here from the buffered write path. This can only be reached by the writeback path via: iomap_do_writepage iomap_writepage_map xfs_map_blocks xfs_convert_blocks xfs_bmapi_convert_delalloc Hence the sequence numbers stored in iomap->private here are completely ignored by the writeback code. They still get stored in the struct xfs_writepage_ctx and checked by xfs_imap_valid(), so the changes here were really just making sure all the xfs_bmbt_to_iomap() callers stored the sequence number consistently. > > @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin( > > xfs_iunlock(ip, lockmode); > > > > error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, > > - flags, &imap); > > + flags, &imap, &seq); > > if (error) > > return error; > > > > trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); > > return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, > > - iomap_flags | IOMAP_F_NEW); > > + iomap_flags | IOMAP_F_NEW, seq); > > > > out_found_cow: > > + seq = READ_ONCE(ip->i_df.if_seq); > > xfs_iunlock(ip, lockmode); > > length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); > > trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); > > if (imap.br_startblock != HOLESTARTBLOCK) { > > - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); > > + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); > > if (error) > > return error; > > } > > - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); > > + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); > > Here we've found a mapping from the COW fork and we're encoding it into > the struct iomap. Why is the sequence number from the *data* fork and > not the COW fork? Because my brain isn't big enough to hold all this code straight in my head. 
I found it all too easy to get confused by what iomap is passed where and how to tell them apart and I could not tell if there was an easy way to tell if the iomap passed to xfs_iomap_valid() needed to check against the data or cow fork. Hence I set it up such that the xfs_iomap_valid() callback only checks the data fork sequence so the only correct thing to set in the iomap is the data fork sequence and moved on to the next part of the problem... But, again, this information probably should be encoded into the sequence cookie we store. Which begs the question - if we are doing a COW operation, we have to check that neither the data fork (the srcmap) nor the COW fork (the iter->iomap) have changed, right? This is what the writeback code does (i.e. checks both data and COW fork sequence numbers), so perhaps that's also what we should be doing here? i.e. either the cookie for COW operations needs to contain both data+cow sequence numbers, and both need to match to proceed, or we have to validate both the srcmap and the iter->iomap with separate callouts once the folio is locked. Thoughts? > > @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end( > > return 0; > > } > > > > +/* > > + * Check that the iomap passed to us is still valid for the given offset and > > + * length. > > + */ > > +static bool > > +xfs_buffered_write_iomap_valid( > > + struct inode *inode, > > + const struct iomap *iomap) > > +{ > > + int seq = *((int *)&iomap->private); > > + > > + if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq)) > > Why is it ok to sample the data fork's sequence number even if the > mapping came from the COW fork? That doesn't make sense to me, > conceptually. I definitely think it's buggy that the revalidation > might not sample from the same sequence counter as the original > measurement. Yup, see above. 
> > const struct iomap_ops xfs_read_iomap_ops = { > > @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin( > > end_fsb = min(end_fsb, data_fsb); > > xfs_trim_extent(&cmap, offset_fsb, end_fsb); > > error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, > > - IOMAP_F_SHARED); > > + IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq)); > > Here we explicitly sample from the cow fork but the comparison is done > against the data fork. That /probably/ doesn't matter for reads since > you're not proposing that we revalidate on those. Should we? As I said elsewhere, I haven't even looked at the read path. The iomap code is now complex enough that I have trouble following it and keeping all the corner cases in my head. This is the down side of having people smarter than you write the code you need to debug when shit inevitably goes wrong. > What > happens if userspace hands us a large read() from an unwritten extent > and the same dirty/writeback/reclaim sequence happens to the last folio > in that read() range before read_iter actually gets there? No idea - shit may well go wrong there in the same way, but I've had my hands full just fixing the write path. I'm *really* tired of fighting with the iomap code right now... > > @@ -189,7 +190,7 @@ xfs_fs_map_blocks( > > xfs_iunlock(ip, lock_flags); > > > > error = xfs_iomap_write_direct(ip, offset_fsb, > > - end_fsb - offset_fsb, 0, &imap); > > + end_fsb - offset_fsb, 0, &imap, &seq); > > I got a compiler warning about seq not being set in the case where we > don't call xfs_iomap_write_direct. Yup, gcc 11.2 doesn't warn about it. Now to find out why my build system is using gcc 11.2 when I upgraded it months ago to use gcc-12 because of exactly these sorts of issues.... :( -Dave.
On Thu, Nov 03, 2022 at 08:39:27AM +1100, Dave Chinner wrote: > My concerns with putting it into the iomap is that different > filesystems will have different mechanisms for detecting stale > iomaps. The way we do it with a generation counter is pretty coarse > as any change to the extent map will invalidate the iomap regardless > of whether they overlap or not. OTOH it is a good way to nudge users to at least implement this simple but working scheme, and it has no real overhead if people later want to do something more fancy. > If, in future, we want something more complex and finer grained > (e.g. an iext tree cursor) to allow us to determine if the > change to the extent tree actually modified the extent backing the > iomap, then we are going to need an opaque cookie of some kind, not > a u32 or a u32*. Yes, but for that it actually has to be worth it and be implemented.
On Thu, Nov 03, 2022 at 09:36:05AM +1100, Dave Chinner wrote: > On Wed, Nov 02, 2022 at 10:19:33AM -0700, Darrick J. Wong wrote: > > On Tue, Nov 01, 2022 at 11:34:11AM +1100, Dave Chinner wrote: > > > From: Dave Chinner <dchinner@redhat.com> > > > > > > Now that iomap supports a mechanism to validate cached iomaps for > > > buffered write operations, hook it up to the XFS buffered write ops > > > so that we can avoid data corruptions that result from stale cached > > > iomaps. See: > > > > > > https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ > > > > > > or the ->iomap_valid() introduction commit for exact details of the > > > corruption vector. > > > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > > --- > > > fs/xfs/libxfs/xfs_bmap.c | 4 +-- > > > fs/xfs/xfs_aops.c | 2 +- > > > fs/xfs/xfs_iomap.c | 69 ++++++++++++++++++++++++++++++---------- > > > fs/xfs/xfs_iomap.h | 4 +-- > > > fs/xfs/xfs_pnfs.c | 5 +-- > > > 5 files changed, 61 insertions(+), 23 deletions(-) > > > > > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c > > > index 49d0d4ea63fc..db225130618c 100644 > > > --- a/fs/xfs/libxfs/xfs_bmap.c > > > +++ b/fs/xfs/libxfs/xfs_bmap.c > > > @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc( > > > * the extent. Just return the real extent at this offset. > > > */ > > > if (!isnullstartblock(bma.got.br_startblock)) { > > > - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); > > > *seq = READ_ONCE(ifp->if_seq); > > > + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); > > > goto out_trans_cancel; > > > } > > > > > > @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc( > > > XFS_STATS_INC(mp, xs_xstrat_quick); > > > > > > ASSERT(!isnullstartblock(bma.got.br_startblock)); > > > - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); > > > *seq = READ_ONCE(ifp->if_seq); > > > + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); > > > > > > if (whichfork == XFS_COW_FORK) > > > > Hmm. 
@ifp here could be the cow fork, which means *seq will be from the > > cow fork. This I think is going to cause problems in > > xfs_buffered_write_iomap_valid... > > We can't get here from the buffered write path. This can only be > reached by the writeback path via: > > iomap_do_writepage > iomap_writepage_map > xfs_map_blocks > xfs_convert_blocks > xfs_bmapi_convert_delalloc > > Hence the sequence numbers stored in iomap->private here are > completely ignored by the writeback code. They still get stored > in the struct xfs_writepage_ctx and checked by xfs_imap_valid(), so > the changes here were really just making sure all the > xfs_bmbt_to_iomap() callers stored the sequence number consistently. > > > > @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin( > > > xfs_iunlock(ip, lockmode); > > > > > > error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, > > > - flags, &imap); > > > + flags, &imap, &seq); > > > if (error) > > > return error; > > > > > > trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); > > > return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, > > > - iomap_flags | IOMAP_F_NEW); > > > + iomap_flags | IOMAP_F_NEW, seq); > > > > > > out_found_cow: > > > + seq = READ_ONCE(ip->i_df.if_seq); > > > xfs_iunlock(ip, lockmode); > > > length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); > > > trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); > > > if (imap.br_startblock != HOLESTARTBLOCK) { > > > - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); > > > + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); > > > if (error) > > > return error; > > > } > > > - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); > > > + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); > > > > Here we've found a mapping from the COW fork and we're encoding it into > > the struct iomap. Why is the sequence number from the *data* fork and > > not the COW fork? 
> > Because my brain isn't big enough to hold all this code straight in > my head. I found it all too easy to get confused by what iomap is > passed where and how to tell them apart and I could not tell if > there was an easy way to tell if the iomap passed to > xfs_iomap_valid() needed to check against the data or cow fork. > > Hence I set it up such that the xfs_iomap_valid() callback only > checks the data fork sequence so the only correct thing to set in > the iomap is the data fork sequence and moved on to the next part of > the problem... > > But, again, this information probably should be encoded into the > sequence cookie we store. Which begs the question - if we are doing > a COW operation, we have to check that neither the data fork (the > srcmap) nor the COW fork (the iter->iomap) have changed, right? This > is what the writeback code does (i.e. checks both data and COW fork > sequence numbers), so perhaps that's also what we should be doing > here? > > i.e. either the cookie for COW operations needs to contain both > data+cow sequence numbers, and both need to match to proceed, or we > have to validate both the srcmap and the iter->iomap with separate > callouts once the folio is locked. > > Thoughts? 
As we sorta discussed last week on IRC, I guess you could start by encoding both values (as needed) in something fugly like this in struct iomap: union { void *private; u64 cookie; }; But the longer term solution is (I suspect) to make a new struct that XFS can envelop: struct iomap_pagecache_ctx { struct iomap_iter iter; /* do not touch */ struct iomap_page_ops *ops; /* callers can attach here */ }; struct xfs_pagecache_ctx { struct iomap_pagecache_ctx xpctx; u32 data_seq; u32 cow_seq; }; and pass that into the iomap functions: struct xfs_pagecache_ctx ctx = { }; ret = iomap_file_buffered_write(iocb, from, &ctx, &xfs_buffered_write_iomap_ops); so that xfs_iomap_revalidate can do the usual struct unwrapping: struct xfs_pagecache_ctx *xpctx; xpctx = container_of(ipctx, struct xfs_pagecache_ctx, ipctx); if (xpctx->data_seq != ip->i_df.df_seq) return NOPE; But that's a pretty aggressive refactoring, not suitable for a bugfix, do it afterwards while you're waiting for the rest of us to check over part 1, etc. --D > > > @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end( > > > return 0; > > > } > > > > > > +/* > > > + * Check that the iomap passed to us is still valid for the given offset and > > > + * length. > > > + */ > > > +static bool > > > +xfs_buffered_write_iomap_valid( > > > + struct inode *inode, > > > + const struct iomap *iomap) > > > +{ > > > + int seq = *((int *)&iomap->private); > > > + > > > + if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq)) > > > > Why is it ok to sample the data fork's sequence number even if the > > mapping came from the COW fork? That doesn't make sense to me, > > conceptually. I definitely think it's buggy that the revalidation > > might not sample from the same sequence counter as the original > > measurement. > > Yup, see above. 
> > > > const struct iomap_ops xfs_read_iomap_ops = { > > > @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin( > > > end_fsb = min(end_fsb, data_fsb); > > > xfs_trim_extent(&cmap, offset_fsb, end_fsb); > > > error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, > > > - IOMAP_F_SHARED); > > > + IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq)); > > > > Here we explicitly sample from the cow fork but the comparison is done > > against the data fork. That /probably/ doesn't matter for reads since > > you're not proposing that we revalidate on those. Should we? > > As I said elsewhere, I haven't even looked at the read path. The > iomap code is now complex enough that I have trouble following it > and keeping all the corner cases in my head. This is the down side > of having people smarter than you write the code you need to debug > when shit inevitably goes wrong. > > > What > > happens if userspace hands us a large read() from an unwritten extent > > and the same dirty/writeback/reclaim sequence happens to the last folio > > in that read() range before read_iter actually gets there? > > No idea - shit may well go wrong there in the same way, but I've had > my hands full just fixing the write path. I'm *really* tired of > fighting with the iomap code right now... > > > > @@ -189,7 +190,7 @@ xfs_fs_map_blocks( > > > xfs_iunlock(ip, lock_flags); > > > > > > error = xfs_iomap_write_direct(ip, offset_fsb, > > > - end_fsb - offset_fsb, 0, &imap); > > > + end_fsb - offset_fsb, 0, &imap, &seq); > > > > I got a compiler warning about seq not being set in the case where we > > don't call xfs_iomap_write_direct. > > Yup, gcc 11.2 doesn't warn about it. Now to find out why my build > system is using gcc 11.2 when I upgraded it months ago to use > gcc-12 because of exactly these sorts of issues.... :( --D > -Dave. > -- > Dave Chinner > david@fromorbit.com
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 49d0d4ea63fc..db225130618c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,8 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); goto out_trans_cancel; } @@ -4599,8 +4599,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); *seq = READ_ONCE(ifp->if_seq); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, *seq); if (whichfork == XFS_COW_FORK) xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f8..ca5a9e45a48c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -373,7 +373,7 @@ xfs_map_blocks( isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2d48fcc7bd6f..5053ffcf10fe 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -54,7 +54,8 @@ xfs_bmbt_to_iomap( struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + int sequence) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +92,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + /* The extent tree sequence is needed for iomap validity checking. 
*/ + *((int *)&iomap->private) = sequence; return 0; } @@ -195,7 +199,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + int *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +290,8 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + if (seq) + *seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +750,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + int seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +819,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +848,25 @@ xfs_direct_write_iomap_begin( xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: + seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) return error; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); 
out_unlock: if (lockmode) @@ -915,6 +925,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1094,26 +1105,29 @@ xfs_buffered_write_iomap_begin( * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: + seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) return error; return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1328,9 +1342,26 @@ xfs_buffered_write_iomap_end( return 0; } +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. 
+ */ +static bool +xfs_buffered_write_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + int seq = *((int *)&iomap->private); + + if (seq != READ_ONCE(XFS_I(inode)->i_df.if_seq)) + return false; + return true; +} + const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_begin = xfs_buffered_write_iomap_begin, .iomap_end = xfs_buffered_write_iomap_end, + .iomap_valid = xfs_buffered_write_iomap_valid, }; /* @@ -1359,6 +1390,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + int seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1372,13 +1404,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? 
IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1438,7 +1471,7 @@ xfs_seek_iomap_begin( end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, READ_ONCE(ip->i_cowfp->if_seq)); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1460,7 +1493,8 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, + READ_ONCE(ip->i_df.if_seq)); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1486,6 +1520,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1502,12 +1537,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = READ_ONCE(ip->i_af.if_seq); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 0f62ab633040..792fed2a9072 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,14 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, int *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, 
unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, int sequence); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd4..eea507a80c5c 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -189,7 +190,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +210,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; out_unlock: