diff mbox series

[PATCHv7,3/6] iomap: Refactor some iop related accessor functions

Message ID 4fe4937718d44c89e0c279175c65921717d9f591.1685962158.git.ritesh.list@gmail.com (mailing list archive)
State Under Review
Headers show
Series iomap: Add support for per-block dirty state to improve write performance | expand

Commit Message

Ritesh Harjani (IBM) June 5, 2023, 10:55 a.m. UTC
We would eventually use iomap_iop_** function naming by the rest of the
buffered-io iomap code. This patch update function arguments and naming
from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
iop_set_range_uptodate() then becomes an accessor function used by
iomap_iop_** functions.

Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 48 deletions(-)

Comments

Andreas Gruenbacher June 5, 2023, 2:15 p.m. UTC | #1
On Mon, Jun 5, 2023 at 12:55 PM Ritesh Harjani (IBM)
<ritesh.list@gmail.com> wrote:
> We would eventually use iomap_iop_** function naming by the rest of the
> buffered-io iomap code. This patch update function arguments and naming
> from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
> iop_set_range_uptodate() then becomes an accessor function used by
> iomap_iop_** functions.
>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
>  1 file changed, 63 insertions(+), 48 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 6fffda355c45..136f57ccd0be 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -24,14 +24,14 @@
>  #define IOEND_BATCH_SIZE       4096
>
>  /*
> - * Structure allocated for each folio when block size < folio size
> - * to track sub-folio uptodate status and I/O completions.
> + * Structure allocated for each folio to track per-block uptodate state
> + * and I/O completions.
>   */
>  struct iomap_page {
>         atomic_t                read_bytes_pending;
>         atomic_t                write_bytes_pending;
> -       spinlock_t              uptodate_lock;
> -       unsigned long           uptodate[];
> +       spinlock_t              state_lock;
> +       unsigned long           state[];
>  };
>
>  static inline struct iomap_page *to_iomap_page(struct folio *folio)
> @@ -43,6 +43,48 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
>
>  static struct bio_set iomap_ioend_bioset;
>
> +static bool iop_test_full_uptodate(struct folio *folio)
> +{
> +       struct iomap_page *iop = to_iomap_page(folio);
> +       struct inode *inode = folio->mapping->host;
> +
> +       return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
> +}

Can this be called iop_test_fully_uptodate(), please?

> +
> +static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
> +{
> +       struct iomap_page *iop = to_iomap_page(folio);
> +
> +       return test_bit(block, iop->state);
> +}
> +
> +static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
> +                                  size_t off, size_t len)
> +{
> +       struct iomap_page *iop = to_iomap_page(folio);

Note that to_iomap_page() does folio_test_private() followed by
folio_get_private(), which doesn't really make sense in places where
we know that iop is defined. Maybe we want to split that up.

> +       unsigned int first_blk = off >> inode->i_blkbits;
> +       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +       unsigned int nr_blks = last_blk - first_blk + 1;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&iop->state_lock, flags);
> +       bitmap_set(iop->state, first_blk, nr_blks);
> +       if (iop_test_full_uptodate(folio))
> +               folio_mark_uptodate(folio);
> +       spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_iop_set_range_uptodate(struct inode *inode,
> +               struct folio *folio, size_t off, size_t len)
> +{
> +       struct iomap_page *iop = to_iomap_page(folio);
> +
> +       if (iop)
> +               iop_set_range_uptodate(inode, folio, off, len);
> +       else
> +               folio_mark_uptodate(folio);
> +}

This patch passes the inode into iomap_iop_set_range_uptodate() and
removes the iop argument. Can this be done in a separate patch,
please?

We have a few places like the above, where we look up the iop without
using it. Would a function like folio_has_iop() make more sense?

> +
>  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>                                 struct folio *folio, unsigned int flags)
>  {
> @@ -58,12 +100,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>         else
>                 gfp = GFP_NOFS | __GFP_NOFAIL;
>
> -       iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
> +       iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
>                       gfp);
>         if (iop) {
> -               spin_lock_init(&iop->uptodate_lock);
> +               spin_lock_init(&iop->state_lock);
>                 if (folio_test_uptodate(folio))
> -                       bitmap_fill(iop->uptodate, nr_blocks);
> +                       bitmap_fill(iop->state, nr_blocks);
>                 folio_attach_private(folio, iop);
>         }
>         return iop;
> @@ -72,14 +114,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>  static void iomap_iop_free(struct folio *folio)
>  {
>         struct iomap_page *iop = to_iomap_page(folio);
> -       struct inode *inode = folio->mapping->host;
> -       unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
>
>         if (!iop)
>                 return;
>         WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
>         WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
> -       WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> +       WARN_ON_ONCE(iop_test_full_uptodate(folio) !=
>                         folio_test_uptodate(folio));
>         folio_detach_private(folio);
>         kfree(iop);
> @@ -111,7 +151,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>
>                 /* move forward for each leading block marked uptodate */
>                 for (i = first; i <= last; i++) {
> -                       if (!test_bit(i, iop->uptodate))
> +                       if (!iop_test_block_uptodate(folio, i))
>                                 break;
>                         *pos += block_size;
>                         poff += block_size;
> @@ -121,7 +161,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>
>                 /* truncate len if we find any trailing uptodate block(s) */
>                 for ( ; i <= last; i++) {
> -                       if (test_bit(i, iop->uptodate)) {
> +                       if (iop_test_block_uptodate(folio, i)) {
>                                 plen -= (last - i + 1) * block_size;
>                                 last = i - 1;
>                                 break;
> @@ -145,30 +185,6 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>         *lenp = plen;
>  }
>
> -static void iomap_iop_set_range_uptodate(struct folio *folio,
> -               struct iomap_page *iop, size_t off, size_t len)
> -{
> -       struct inode *inode = folio->mapping->host;
> -       unsigned first = off >> inode->i_blkbits;
> -       unsigned last = (off + len - 1) >> inode->i_blkbits;
> -       unsigned long flags;
> -
> -       spin_lock_irqsave(&iop->uptodate_lock, flags);
> -       bitmap_set(iop->uptodate, first, last - first + 1);
> -       if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
> -               folio_mark_uptodate(folio);
> -       spin_unlock_irqrestore(&iop->uptodate_lock, flags);
> -}
> -
> -static void iomap_set_range_uptodate(struct folio *folio,
> -               struct iomap_page *iop, size_t off, size_t len)
> -{
> -       if (iop)
> -               iomap_iop_set_range_uptodate(folio, iop, off, len);
> -       else
> -               folio_mark_uptodate(folio);
> -}
> -
>  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>                 size_t len, int error)
>  {
> @@ -178,7 +194,8 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>                 folio_clear_uptodate(folio);
>                 folio_set_error(folio);
>         } else {
> -               iomap_set_range_uptodate(folio, iop, offset, len);
> +               iomap_iop_set_range_uptodate(folio->mapping->host, folio,
> +                                            offset, len);
>         }
>
>         if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
> @@ -214,7 +231,6 @@ struct iomap_readpage_ctx {
>  static int iomap_read_inline_data(const struct iomap_iter *iter,
>                 struct folio *folio)
>  {
> -       struct iomap_page *iop;
>         const struct iomap *iomap = iomap_iter_srcmap(iter);
>         size_t size = i_size_read(iter->inode) - iomap->offset;
>         size_t poff = offset_in_page(iomap->offset);
> @@ -232,15 +248,14 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
>         if (WARN_ON_ONCE(size > iomap->length))
>                 return -EIO;
>         if (offset > 0)
> -               iop = iomap_iop_alloc(iter->inode, folio, iter->flags);
> -       else
> -               iop = to_iomap_page(folio);
> +               iomap_iop_alloc(iter->inode, folio, iter->flags);
>
>         addr = kmap_local_folio(folio, offset);
>         memcpy(addr, iomap->inline_data, size);
>         memset(addr + size, 0, PAGE_SIZE - poff - size);
>         kunmap_local(addr);
> -       iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
> +       iomap_iop_set_range_uptodate(iter->inode, folio, offset,
> +                                    PAGE_SIZE - poff);
>         return 0;
>  }
>
> @@ -277,7 +292,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>
>         if (iomap_block_needs_zeroing(iter, pos)) {
>                 folio_zero_range(folio, poff, plen);
> -               iomap_set_range_uptodate(folio, iop, poff, plen);
> +               iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
>                 goto done;
>         }
>
> @@ -452,7 +467,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
>         last = (from + count - 1) >> inode->i_blkbits;
>
>         for (i = first; i <= last; i++)
> -               if (!test_bit(i, iop->uptodate))
> +               if (!iop_test_block_uptodate(folio, i))
>                         return false;
>         return true;
>  }
> @@ -591,7 +606,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>                         if (status)
>                                 return status;
>                 }
> -               iomap_set_range_uptodate(folio, iop, poff, plen);
> +               iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
>         } while ((block_start += plen) < block_end);
>
>         return 0;
> @@ -698,7 +713,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
>  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>                 size_t copied, struct folio *folio)
>  {
> -       struct iomap_page *iop = to_iomap_page(folio);
>         flush_dcache_folio(folio);
>
>         /*
> @@ -714,7 +728,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>          */
>         if (unlikely(copied < len && !folio_test_uptodate(folio)))
>                 return 0;
> -       iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
> +       iomap_iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos),
> +                                    len);
>         filemap_dirty_folio(inode->i_mapping, folio);
>         return copied;
>  }
> @@ -1630,7 +1645,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>          * invalid, grab a new one.
>          */
>         for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -               if (iop && !test_bit(i, iop->uptodate))
> +               if (iop && !iop_test_block_uptodate(folio, i))
>                         continue;
>
>                 error = wpc->ops->map_blocks(wpc, inode, pos);
> --
> 2.40.1
>

Thanks,
Andreas
Matthew Wilcox June 5, 2023, 2:31 p.m. UTC | #2
On Mon, Jun 05, 2023 at 04:15:31PM +0200, Andreas Gruenbacher wrote:
> Note that to_iomap_page() does folio_test_private() followed by
> folio_get_private(), which doesn't really make sense in places where
> we know that iop is defined. Maybe we want to split that up.

The plan is to retire the folio private flag entirely.  I just haven't
got round to cleaning up iomap yet.  For folios which we know to be
file-backed, we can just test whether folio->private is NULL or not.

So I'd do this as:

-	struct iomap_page *iop = to_iomap_page(folio);
+	struct iomap_page *iop = folio->private;

and not even use folio_get_private() or to_iomap_page() any more.

> > +       unsigned int first_blk = off >> inode->i_blkbits;
> > +       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> > +       unsigned int nr_blks = last_blk - first_blk + 1;
> > +       unsigned long flags;
> > +
> > +       spin_lock_irqsave(&iop->state_lock, flags);
> > +       bitmap_set(iop->state, first_blk, nr_blks);
> > +       if (iop_test_full_uptodate(folio))
> > +               folio_mark_uptodate(folio);
> > +       spin_unlock_irqrestore(&iop->state_lock, flags);
> > +}
> > +
> > +static void iomap_iop_set_range_uptodate(struct inode *inode,
> > +               struct folio *folio, size_t off, size_t len)
> > +{
> > +       struct iomap_page *iop = to_iomap_page(folio);
> > +
> > +       if (iop)
> > +               iop_set_range_uptodate(inode, folio, off, len);
> > +       else
> > +               folio_mark_uptodate(folio);
> > +}
> 
> This patch passes the inode into iomap_iop_set_range_uptodate() and
> removes the iop argument. Can this be done in a separate patch,
> please?
> 
> We have a few places like the above, where we look up the iop without
> using it. Would a function like folio_has_iop() make more sense?

I think this is all a symptom of the unnecessary splitting of
iomap_iop_set_range_uptodate and iop_set_range_uptodate.
Ritesh Harjani (IBM) June 5, 2023, 8:48 p.m. UTC | #3
Matthew Wilcox <willy@infradead.org> writes:

> On Mon, Jun 05, 2023 at 04:15:31PM +0200, Andreas Gruenbacher wrote:
>> Note that to_iomap_page() does folio_test_private() followed by
>> folio_get_private(), which doesn't really make sense in places where
>> we know that iop is defined. Maybe we want to split that up.
>
> The plan is to retire the folio private flag entirely.  I just haven't
> got round to cleaning up iomap yet.  For folios which we know to be
> file-backed, we can just test whether folio->private is NULL or not.
>
> So I'd do this as:
>
> -	struct iomap_page *iop = to_iomap_page(folio);
> +	struct iomap_page *iop = folio->private;
>
> and not even use folio_get_private() or to_iomap_page() any more.
>

In that case, shouldn't we just modify to_iomap_page(folio) function
to just return folio_get_private(folio) or folio->private ?

>> > +       unsigned int first_blk = off >> inode->i_blkbits;
>> > +       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> > +       unsigned int nr_blks = last_blk - first_blk + 1;
>> > +       unsigned long flags;
>> > +
>> > +       spin_lock_irqsave(&iop->state_lock, flags);
>> > +       bitmap_set(iop->state, first_blk, nr_blks);
>> > +       if (iop_test_full_uptodate(folio))
>> > +               folio_mark_uptodate(folio);
>> > +       spin_unlock_irqrestore(&iop->state_lock, flags);
>> > +}
>> > +
>> > +static void iomap_iop_set_range_uptodate(struct inode *inode,
>> > +               struct folio *folio, size_t off, size_t len)
>> > +{
>> > +       struct iomap_page *iop = to_iomap_page(folio);
>> > +
>> > +       if (iop)
>> > +               iop_set_range_uptodate(inode, folio, off, len);
>> > +       else
>> > +               folio_mark_uptodate(folio);
>> > +}
>>
>> This patch passes the inode into iomap_iop_set_range_uptodate() and
>> removes the iop argument. Can this be done in a separate patch,
>> please?
>>
>> We have a few places like the above, where we look up the iop without
>> using it. Would a function like folio_has_iop() make more sense?
>
> I think this is all a symptom of the unnecessary splitting of
> iomap_iop_set_range_uptodate and iop_set_range_uptodate.

Thanks for the review. The problem in not splitting this would be a lot
of variable initialization for !iop case as well.
Hence in one of the previous versions it was discussed that splitting it
into a helper routine for iop case would be a better approach.

-ritesh
Ritesh Harjani (IBM) June 5, 2023, 9 p.m. UTC | #4
Andreas Gruenbacher <agruenba@redhat.com> writes:

> On Mon, Jun 5, 2023 at 12:55 PM Ritesh Harjani (IBM)
> <ritesh.list@gmail.com> wrote:
>> We would eventually use iomap_iop_** function naming by the rest of the
>> buffered-io iomap code. This patch update function arguments and naming
>> from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
>> iop_set_range_uptodate() then becomes an accessor function used by
>> iomap_iop_** functions.
>>
>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>> ---
>>  fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
>>  1 file changed, 63 insertions(+), 48 deletions(-)
>>
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index 6fffda355c45..136f57ccd0be 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -24,14 +24,14 @@
>>  #define IOEND_BATCH_SIZE       4096
>>
>>  /*
>> - * Structure allocated for each folio when block size < folio size
>> - * to track sub-folio uptodate status and I/O completions.
>> + * Structure allocated for each folio to track per-block uptodate state
>> + * and I/O completions.
>>   */
>>  struct iomap_page {
>>         atomic_t                read_bytes_pending;
>>         atomic_t                write_bytes_pending;
>> -       spinlock_t              uptodate_lock;
>> -       unsigned long           uptodate[];
>> +       spinlock_t              state_lock;
>> +       unsigned long           state[];
>>  };
>>
>>  static inline struct iomap_page *to_iomap_page(struct folio *folio)
>> @@ -43,6 +43,48 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
>>
>>  static struct bio_set iomap_ioend_bioset;
>>
>> +static bool iop_test_full_uptodate(struct folio *folio)
>> +{
>> +       struct iomap_page *iop = to_iomap_page(folio);
>> +       struct inode *inode = folio->mapping->host;
>> +
>> +       return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
>> +}
>
> Can this be called iop_test_fully_uptodate(), please?
>

IMHO, iop_test_full_uptodate() looks fine. It goes similar to
bitmap_full() function.

>> +
>> +static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
>> +{
>> +       struct iomap_page *iop = to_iomap_page(folio);
>> +
>> +       return test_bit(block, iop->state);
>> +}
>> +
>> +static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
>> +                                  size_t off, size_t len)
>> +{
>> +       struct iomap_page *iop = to_iomap_page(folio);
>
> Note that to_iomap_page() does folio_test_private() followed by
> folio_get_private(), which doesn't really make sense in places where
> we know that iop is defined. Maybe we want to split that up.
>

I think in mm we have PG_Private flag which gets set as a pageflag.
So folio_test_private() actually checks whether we have PG_Private flag
set or not ( I guess it could be to overload folio->private use).

For file folio, maybe can we directly return folio_get_private(folio)
from to_iomap_page(folio) ?

>> +       unsigned int first_blk = off >> inode->i_blkbits;
>> +       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> +       unsigned int nr_blks = last_blk - first_blk + 1;
>> +       unsigned long flags;
>> +
>> +       spin_lock_irqsave(&iop->state_lock, flags);
>> +       bitmap_set(iop->state, first_blk, nr_blks);
>> +       if (iop_test_full_uptodate(folio))
>> +               folio_mark_uptodate(folio);
>> +       spin_unlock_irqrestore(&iop->state_lock, flags);
>> +}
>> +
>> +static void iomap_iop_set_range_uptodate(struct inode *inode,
>> +               struct folio *folio, size_t off, size_t len)
>> +{
>> +       struct iomap_page *iop = to_iomap_page(folio);
>> +
>> +       if (iop)
>> +               iop_set_range_uptodate(inode, folio, off, len);
>> +       else
>> +               folio_mark_uptodate(folio);
>> +}
>
> This patch passes the inode into iomap_iop_set_range_uptodate() and
> removes the iop argument. Can this be done in a separate patch,
> please?
>
> We have a few places like the above, where we look up the iop without
> using it. Would a function like folio_has_iop() make more sense?
>

Just realized that we should also rename
to_iomap_page(folio) -> iomap_get_iop(folio).

For your comment, we can use that as -

+static void iomap_iop_set_range_uptodate(struct inode *inode,
+               struct folio *folio, size_t off, size_t len)
+{
+       if (iomap_get_iop(folio))
+               iop_set_range_uptodate(inode, folio, off, len);
+       else
+               folio_mark_uptodate(folio);
+}


Does this looks ok?

-ritesh
Andreas Grünbacher June 5, 2023, 9:50 p.m. UTC | #5
Am Mo., 5. Juni 2023 um 23:05 Uhr schrieb Ritesh Harjani
<ritesh.list@gmail.com>:
> Andreas Gruenbacher <agruenba@redhat.com> writes:
> > On Mon, Jun 5, 2023 at 12:55 PM Ritesh Harjani (IBM) <ritesh.list@gmail.com> wrote:
> >> We would eventually use iomap_iop_** function naming by the rest of the
> >> buffered-io iomap code. This patch update function arguments and naming
> >> from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
> >> iop_set_range_uptodate() then becomes an accessor function used by
> >> iomap_iop_** functions.
> >>
> >> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> >> ---
> >>  fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
> >>  1 file changed, 63 insertions(+), 48 deletions(-)
> >>
> >> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> >> index 6fffda355c45..136f57ccd0be 100644
> >> --- a/fs/iomap/buffered-io.c
> >> +++ b/fs/iomap/buffered-io.c
> >> @@ -24,14 +24,14 @@
> >>  #define IOEND_BATCH_SIZE       4096
> >>
> >>  /*
> >> - * Structure allocated for each folio when block size < folio size
> >> - * to track sub-folio uptodate status and I/O completions.
> >> + * Structure allocated for each folio to track per-block uptodate state
> >> + * and I/O completions.
> >>   */
> >>  struct iomap_page {
> >>         atomic_t                read_bytes_pending;
> >>         atomic_t                write_bytes_pending;
> >> -       spinlock_t              uptodate_lock;
> >> -       unsigned long           uptodate[];
> >> +       spinlock_t              state_lock;
> >> +       unsigned long           state[];
> >>  };
> >>
> >>  static inline struct iomap_page *to_iomap_page(struct folio *folio)
> >> @@ -43,6 +43,48 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
> >>
> >>  static struct bio_set iomap_ioend_bioset;
> >>
> >> +static bool iop_test_full_uptodate(struct folio *folio)
> >> +{
> >> +       struct iomap_page *iop = to_iomap_page(folio);
> >> +       struct inode *inode = folio->mapping->host;
> >> +
> >> +       return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
> >> +}
> >
> > Can this be called iop_test_fully_uptodate(), please?
> >
>
> IMHO, iop_test_full_uptodate() looks fine. It goes similar to
> bitmap_full() function.

Nah, it really isn't fine, it's "the bitmap is full" vs. "the iop is
fully uptodate"; you can't say "the iop is full uptodate".

Thanks,
Andreas
Darrick J. Wong June 5, 2023, 10:54 p.m. UTC | #6
On Mon, Jun 05, 2023 at 04:25:03PM +0530, Ritesh Harjani (IBM) wrote:
> We would eventually use iomap_iop_** function naming by the rest of the
> buffered-io iomap code. This patch update function arguments and naming
> from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
> iop_set_range_uptodate() then becomes an accessor function used by
> iomap_iop_** functions.
> 
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
>  1 file changed, 63 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 6fffda355c45..136f57ccd0be 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -24,14 +24,14 @@
>  #define IOEND_BATCH_SIZE	4096
>  
>  /*
> - * Structure allocated for each folio when block size < folio size
> - * to track sub-folio uptodate status and I/O completions.
> + * Structure allocated for each folio to track per-block uptodate state
> + * and I/O completions.
>   */
>  struct iomap_page {
>  	atomic_t		read_bytes_pending;
>  	atomic_t		write_bytes_pending;
> -	spinlock_t		uptodate_lock;
> -	unsigned long		uptodate[];
> +	spinlock_t		state_lock;
> +	unsigned long		state[];
>  };
>  
>  static inline struct iomap_page *to_iomap_page(struct folio *folio)
> @@ -43,6 +43,48 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
>  
>  static struct bio_set iomap_ioend_bioset;
>  
> +static bool iop_test_full_uptodate(struct folio *folio)

Same comment as Andreas, I think this works better with 'fully', e.g.

iop_test_fully_uptodate()

Why you don't pass the iomap_page directly into this function?  Doesn't
that eliminate the need for iomap_iop_free to keep folio->private set
until the very end?

static inline bool
iomap_iop_is_fully_uptodate(const struct iomap_page *iop,
			    const struct folio *folio)

Same sort of thing for the second function -- we already extracted
folio->private and checked it wasn't null, so we don't need to do that
again.

static inline bool
iomap_io_is_block_uptodate(const struct iomap_page *iop,
			   const struct folio *folio,
			   unsigned int block)

> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +	struct inode *inode = folio->mapping->host;
> +
> +	return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
> +}
> +
> +static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +
> +	return test_bit(block, iop->state);
> +}
> +
> +static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
> +				   size_t off, size_t len)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +	unsigned int first_blk = off >> inode->i_blkbits;
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	bitmap_set(iop->state, first_blk, nr_blks);
> +	if (iop_test_full_uptodate(folio))
> +		folio_mark_uptodate(folio);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_iop_set_range_uptodate(struct inode *inode,

I don't understand why iomap_set_range_uptodate is now
iomap_iop_set_range_uptodate; it doesn't take an iomap_page object as an
argument...?

I thought I understood that iomap_FOO operates on a folio and a range,
whereas iomap_iop_FOO operates on sub-blocks within a folio?  And that
you were renaming the iop_* functions to iomap_iop_*?

I'm also not sure why iop_set_range_uptodate needs to be passed the
struct inode; can't it extract that from folio->mapping->host, like
current upstream does?

Generally I don't understand why this part of the patch is needed at
all.  Wasn't the point merely to rename uptodate_* to state_* and
introduce the iomap_iop_test_*_uptodate helpers?

--D

> +		struct folio *folio, size_t off, size_t len)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +
> +	if (iop)
> +		iop_set_range_uptodate(inode, folio, off, len);
> +	else
> +		folio_mark_uptodate(folio);
> +}
> +
>  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>  				struct folio *folio, unsigned int flags)
>  {
> @@ -58,12 +100,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>  	else
>  		gfp = GFP_NOFS | __GFP_NOFAIL;
>  
> -	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
> +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
>  		      gfp);
>  	if (iop) {
> -		spin_lock_init(&iop->uptodate_lock);
> +		spin_lock_init(&iop->state_lock);
>  		if (folio_test_uptodate(folio))
> -			bitmap_fill(iop->uptodate, nr_blocks);
> +			bitmap_fill(iop->state, nr_blocks);
>  		folio_attach_private(folio, iop);
>  	}
>  	return iop;
> @@ -72,14 +114,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>  static void iomap_iop_free(struct folio *folio)
>  {
>  	struct iomap_page *iop = to_iomap_page(folio);
> -	struct inode *inode = folio->mapping->host;
> -	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
>  
>  	if (!iop)
>  		return;
>  	WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
>  	WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
> -	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> +	WARN_ON_ONCE(iop_test_full_uptodate(folio) !=
>  			folio_test_uptodate(folio));
>  	folio_detach_private(folio);
>  	kfree(iop);
> @@ -111,7 +151,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>  
>  		/* move forward for each leading block marked uptodate */
>  		for (i = first; i <= last; i++) {
> -			if (!test_bit(i, iop->uptodate))
> +			if (!iop_test_block_uptodate(folio, i))
>  				break;
>  			*pos += block_size;
>  			poff += block_size;
> @@ -121,7 +161,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>  
>  		/* truncate len if we find any trailing uptodate block(s) */
>  		for ( ; i <= last; i++) {
> -			if (test_bit(i, iop->uptodate)) {
> +			if (iop_test_block_uptodate(folio, i)) {
>  				plen -= (last - i + 1) * block_size;
>  				last = i - 1;
>  				break;
> @@ -145,30 +185,6 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>  	*lenp = plen;
>  }
>  
> -static void iomap_iop_set_range_uptodate(struct folio *folio,
> -		struct iomap_page *iop, size_t off, size_t len)
> -{
> -	struct inode *inode = folio->mapping->host;
> -	unsigned first = off >> inode->i_blkbits;
> -	unsigned last = (off + len - 1) >> inode->i_blkbits;
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&iop->uptodate_lock, flags);
> -	bitmap_set(iop->uptodate, first, last - first + 1);
> -	if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
> -		folio_mark_uptodate(folio);
> -	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
> -}
> -
> -static void iomap_set_range_uptodate(struct folio *folio,
> -		struct iomap_page *iop, size_t off, size_t len)
> -{
> -	if (iop)
> -		iomap_iop_set_range_uptodate(folio, iop, off, len);
> -	else
> -		folio_mark_uptodate(folio);
> -}
> -
>  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>  		size_t len, int error)
>  {
> @@ -178,7 +194,8 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>  		folio_clear_uptodate(folio);
>  		folio_set_error(folio);
>  	} else {
> -		iomap_set_range_uptodate(folio, iop, offset, len);
> +		iomap_iop_set_range_uptodate(folio->mapping->host, folio,
> +					     offset, len);
>  	}
>  
>  	if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
> @@ -214,7 +231,6 @@ struct iomap_readpage_ctx {
>  static int iomap_read_inline_data(const struct iomap_iter *iter,
>  		struct folio *folio)
>  {
> -	struct iomap_page *iop;
>  	const struct iomap *iomap = iomap_iter_srcmap(iter);
>  	size_t size = i_size_read(iter->inode) - iomap->offset;
>  	size_t poff = offset_in_page(iomap->offset);
> @@ -232,15 +248,14 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
>  	if (WARN_ON_ONCE(size > iomap->length))
>  		return -EIO;
>  	if (offset > 0)
> -		iop = iomap_iop_alloc(iter->inode, folio, iter->flags);
> -	else
> -		iop = to_iomap_page(folio);
> +		iomap_iop_alloc(iter->inode, folio, iter->flags);
>  
>  	addr = kmap_local_folio(folio, offset);
>  	memcpy(addr, iomap->inline_data, size);
>  	memset(addr + size, 0, PAGE_SIZE - poff - size);
>  	kunmap_local(addr);
> -	iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
> +	iomap_iop_set_range_uptodate(iter->inode, folio, offset,
> +				     PAGE_SIZE - poff);
>  	return 0;
>  }
>  
> @@ -277,7 +292,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>  
>  	if (iomap_block_needs_zeroing(iter, pos)) {
>  		folio_zero_range(folio, poff, plen);
> -		iomap_set_range_uptodate(folio, iop, poff, plen);
> +		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
>  		goto done;
>  	}
>  
> @@ -452,7 +467,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
>  	last = (from + count - 1) >> inode->i_blkbits;
>  
>  	for (i = first; i <= last; i++)
> -		if (!test_bit(i, iop->uptodate))
> +		if (!iop_test_block_uptodate(folio, i))
>  			return false;
>  	return true;
>  }
> @@ -591,7 +606,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>  			if (status)
>  				return status;
>  		}
> -		iomap_set_range_uptodate(folio, iop, poff, plen);
> +		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
>  	} while ((block_start += plen) < block_end);
>  
>  	return 0;
> @@ -698,7 +713,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
>  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>  		size_t copied, struct folio *folio)
>  {
> -	struct iomap_page *iop = to_iomap_page(folio);
>  	flush_dcache_folio(folio);
>  
>  	/*
> @@ -714,7 +728,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>  	 */
>  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
>  		return 0;
> -	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
> +	iomap_iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos),
> +				     len);
>  	filemap_dirty_folio(inode->i_mapping, folio);
>  	return copied;
>  }
> @@ -1630,7 +1645,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (iop && !test_bit(i, iop->uptodate))
> +		if (iop && !iop_test_block_uptodate(folio, i))
>  			continue;
>  
>  		error = wpc->ops->map_blocks(wpc, inode, pos);
> -- 
> 2.40.1
>
Ritesh Harjani (IBM) June 5, 2023, 11:51 p.m. UTC | #7
"Darrick J. Wong" <djwong@kernel.org> writes:

> On Mon, Jun 05, 2023 at 04:25:03PM +0530, Ritesh Harjani (IBM) wrote:
>> We would eventually use iomap_iop_** function naming by the rest of the
>> buffered-io iomap code. This patch update function arguments and naming
>> from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
>> iop_set_range_uptodate() then becomes an accessor function used by
>> iomap_iop_** functions.
>>
>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>> ---
>>  fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
>>  1 file changed, 63 insertions(+), 48 deletions(-)
>>
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index 6fffda355c45..136f57ccd0be 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -24,14 +24,14 @@
>>  #define IOEND_BATCH_SIZE	4096
>>
>>  /*
>> - * Structure allocated for each folio when block size < folio size
>> - * to track sub-folio uptodate status and I/O completions.
>> + * Structure allocated for each folio to track per-block uptodate state
>> + * and I/O completions.
>>   */
>>  struct iomap_page {
>>  	atomic_t		read_bytes_pending;
>>  	atomic_t		write_bytes_pending;
>> -	spinlock_t		uptodate_lock;
>> -	unsigned long		uptodate[];
>> +	spinlock_t		state_lock;
>> +	unsigned long		state[];
>>  };
>>
>>  static inline struct iomap_page *to_iomap_page(struct folio *folio)
>> @@ -43,6 +43,48 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
>>
>>  static struct bio_set iomap_ioend_bioset;
>>
>> +static bool iop_test_full_uptodate(struct folio *folio)
>
> Same comment as Andreas, I think this works better with 'fully', e.g.
>
> iop_test_fully_uptodate()
>
> Why you don't pass the iomap_page directly into this function?  Doesn't
> that eliminate the need for iomap_iop_free to keep folio->private set
> until the very end?
>
> static inline bool
> iomap_iop_is_fully_uptodate(const struct iomap_page *iop,
> 			    const struct folio *folio)
>
> Same sort of thing for the second function -- we already extracted
> folio->private and checked it wasn't null, so we don't need to do that
> again.
>
> static inline bool
> iomap_io_is_block_uptodate(const struct iomap_page *iop,
> 			   const struct folio *folio,
> 			   unsigned int block)
>
>> +{
>> +	struct iomap_page *iop = to_iomap_page(folio);
>> +	struct inode *inode = folio->mapping->host;
>> +
>> +	return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
>> +}
>> +
>> +static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
>> +{
>> +	struct iomap_page *iop = to_iomap_page(folio);
>> +
>> +	return test_bit(block, iop->state);
>> +}
>> +
>> +static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
>> +				   size_t off, size_t len)
>> +{
>> +	struct iomap_page *iop = to_iomap_page(folio);
>> +	unsigned int first_blk = off >> inode->i_blkbits;
>> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> +	unsigned int nr_blks = last_blk - first_blk + 1;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&iop->state_lock, flags);
>> +	bitmap_set(iop->state, first_blk, nr_blks);
>> +	if (iop_test_full_uptodate(folio))
>> +		folio_mark_uptodate(folio);
>> +	spin_unlock_irqrestore(&iop->state_lock, flags);
>> +}
>> +
>> +static void iomap_iop_set_range_uptodate(struct inode *inode,
>
> I don't understand why iomap_set_range_uptodate is now
> iomap_iop_set_range_uptodate; it doesn't take an iomap_page object as an
> argument...?
>
> I thought I understood that iomap_FOO operates on a folio and a range,
> whereas iomap_iop_FOO operates on sub-blocks within a folio?  And that
> you were renaming the iop_* functions to iomap_iop_*?
>
> I'm also not sure why iop_set_range_uptodate needs to be passed the
> struct inode; can't it extract that from folio->mapping->host, like
> current upstream does?
>

So, I do have a confusion in __folio_mark_dirty() function...

i.e. __folio_mark_dirty checks whether folio->mapping is not NULL.
That means for marking range of blocks dirty within iop from
->dirty_folio(), we can't use folio->mapping->host is it?
We have to use inode from mapping->host (mapping is passed as a
parameter in ->dirty_folio).

I tried looking into the history of this, but I couldn't find any.
This is also the reason I thought we would need to pass an inode for the
iop_set_range_dirty() function (because it is also called from
->dirty_folio -> iomap_dirty_folio()) and to keep it consistent, I kept
inode argument for iop_**_uptodate functions as well.

Thoughts please?

<some code snippet>
====================

iomap_dirty_folio -> filemap_dirty_folio-> __folio_mark_dirty()

void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
			     int warn)
{
	unsigned long flags;

	xa_lock_irqsave(&mapping->i_pages, flags);
	if (folio->mapping) {	/* Race with truncate? */
		WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
		folio_account_dirtied(folio, mapping);
		__xa_set_mark(&mapping->i_pages, folio_index(folio),
				PAGECACHE_TAG_DIRTY);
	}
	xa_unlock_irqrestore(&mapping->i_pages, flags);
}

-ritesh



> Generally I don't understand why this part of the patch is needed at
> all.  Wasn't the point merely to rename uptodate_* to state_* and
> introduce the iomap_iop_test_*_uptodate helpers?
>
> --D
>
>> +		struct folio *folio, size_t off, size_t len)
>> +{
>> +	struct iomap_page *iop = to_iomap_page(folio);
>> +
>> +	if (iop)
>> +		iop_set_range_uptodate(inode, folio, off, len);
>> +	else
>> +		folio_mark_uptodate(folio);
>> +}
>> +
>>  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>>  				struct folio *folio, unsigned int flags)
>>  {
>> @@ -58,12 +100,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>>  	else
>>  		gfp = GFP_NOFS | __GFP_NOFAIL;
>>
>> -	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
>> +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
>>  		      gfp);
>>  	if (iop) {
>> -		spin_lock_init(&iop->uptodate_lock);
>> +		spin_lock_init(&iop->state_lock);
>>  		if (folio_test_uptodate(folio))
>> -			bitmap_fill(iop->uptodate, nr_blocks);
>> +			bitmap_fill(iop->state, nr_blocks);
>>  		folio_attach_private(folio, iop);
>>  	}
>>  	return iop;
>> @@ -72,14 +114,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
>>  static void iomap_iop_free(struct folio *folio)
>>  {
>>  	struct iomap_page *iop = to_iomap_page(folio);
>> -	struct inode *inode = folio->mapping->host;
>> -	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
>>
>>  	if (!iop)
>>  		return;
>>  	WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
>>  	WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
>> -	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
>> +	WARN_ON_ONCE(iop_test_full_uptodate(folio) !=
>>  			folio_test_uptodate(folio));
>>  	folio_detach_private(folio);
>>  	kfree(iop);
>> @@ -111,7 +151,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>>
>>  		/* move forward for each leading block marked uptodate */
>>  		for (i = first; i <= last; i++) {
>> -			if (!test_bit(i, iop->uptodate))
>> +			if (!iop_test_block_uptodate(folio, i))
>>  				break;
>>  			*pos += block_size;
>>  			poff += block_size;
>> @@ -121,7 +161,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>>
>>  		/* truncate len if we find any trailing uptodate block(s) */
>>  		for ( ; i <= last; i++) {
>> -			if (test_bit(i, iop->uptodate)) {
>> +			if (iop_test_block_uptodate(folio, i)) {
>>  				plen -= (last - i + 1) * block_size;
>>  				last = i - 1;
>>  				break;
>> @@ -145,30 +185,6 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
>>  	*lenp = plen;
>>  }
>>
>> -static void iomap_iop_set_range_uptodate(struct folio *folio,
>> -		struct iomap_page *iop, size_t off, size_t len)
>> -{
>> -	struct inode *inode = folio->mapping->host;
>> -	unsigned first = off >> inode->i_blkbits;
>> -	unsigned last = (off + len - 1) >> inode->i_blkbits;
>> -	unsigned long flags;
>> -
>> -	spin_lock_irqsave(&iop->uptodate_lock, flags);
>> -	bitmap_set(iop->uptodate, first, last - first + 1);
>> -	if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
>> -		folio_mark_uptodate(folio);
>> -	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
>> -}
>> -
>> -static void iomap_set_range_uptodate(struct folio *folio,
>> -		struct iomap_page *iop, size_t off, size_t len)
>> -{
>> -	if (iop)
>> -		iomap_iop_set_range_uptodate(folio, iop, off, len);
>> -	else
>> -		folio_mark_uptodate(folio);
>> -}
>> -
>>  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>>  		size_t len, int error)
>>  {
>> @@ -178,7 +194,8 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
>>  		folio_clear_uptodate(folio);
>>  		folio_set_error(folio);
>>  	} else {
>> -		iomap_set_range_uptodate(folio, iop, offset, len);
>> +		iomap_iop_set_range_uptodate(folio->mapping->host, folio,
>> +					     offset, len);
>>  	}
>>
>>  	if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
>> @@ -214,7 +231,6 @@ struct iomap_readpage_ctx {
>>  static int iomap_read_inline_data(const struct iomap_iter *iter,
>>  		struct folio *folio)
>>  {
>> -	struct iomap_page *iop;
>>  	const struct iomap *iomap = iomap_iter_srcmap(iter);
>>  	size_t size = i_size_read(iter->inode) - iomap->offset;
>>  	size_t poff = offset_in_page(iomap->offset);
>> @@ -232,15 +248,14 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
>>  	if (WARN_ON_ONCE(size > iomap->length))
>>  		return -EIO;
>>  	if (offset > 0)
>> -		iop = iomap_iop_alloc(iter->inode, folio, iter->flags);
>> -	else
>> -		iop = to_iomap_page(folio);
>> +		iomap_iop_alloc(iter->inode, folio, iter->flags);
>>
>>  	addr = kmap_local_folio(folio, offset);
>>  	memcpy(addr, iomap->inline_data, size);
>>  	memset(addr + size, 0, PAGE_SIZE - poff - size);
>>  	kunmap_local(addr);
>> -	iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
>> +	iomap_iop_set_range_uptodate(iter->inode, folio, offset,
>> +				     PAGE_SIZE - poff);
>>  	return 0;
>>  }
>>
>> @@ -277,7 +292,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
>>
>>  	if (iomap_block_needs_zeroing(iter, pos)) {
>>  		folio_zero_range(folio, poff, plen);
>> -		iomap_set_range_uptodate(folio, iop, poff, plen);
>> +		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
>>  		goto done;
>>  	}
>>
>> @@ -452,7 +467,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
>>  	last = (from + count - 1) >> inode->i_blkbits;
>>
>>  	for (i = first; i <= last; i++)
>> -		if (!test_bit(i, iop->uptodate))
>> +		if (!iop_test_block_uptodate(folio, i))
>>  			return false;
>>  	return true;
>>  }
>> @@ -591,7 +606,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
>>  			if (status)
>>  				return status;
>>  		}
>> -		iomap_set_range_uptodate(folio, iop, poff, plen);
>> +		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
>>  	} while ((block_start += plen) < block_end);
>>
>>  	return 0;
>> @@ -698,7 +713,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
>>  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>>  		size_t copied, struct folio *folio)
>>  {
>> -	struct iomap_page *iop = to_iomap_page(folio);
>>  	flush_dcache_folio(folio);
>>
>>  	/*
>> @@ -714,7 +728,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>>  	 */
>>  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
>>  		return 0;
>> -	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
>> +	iomap_iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos),
>> +				     len);
>>  	filemap_dirty_folio(inode->i_mapping, folio);
>>  	return copied;
>>  }
>> @@ -1630,7 +1645,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>>  	 * invalid, grab a new one.
>>  	 */
>>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
>> -		if (iop && !test_bit(i, iop->uptodate))
>> +		if (iop && !iop_test_block_uptodate(folio, i))
>>  			continue;
>>
>>  		error = wpc->ops->map_blocks(wpc, inode, pos);
>> --
>> 2.40.1
>>
Darrick J. Wong June 6, 2023, 4:03 p.m. UTC | #8
On Tue, Jun 06, 2023 at 05:21:32AM +0530, Ritesh Harjani wrote:
> "Darrick J. Wong" <djwong@kernel.org> writes:
> 
> > On Mon, Jun 05, 2023 at 04:25:03PM +0530, Ritesh Harjani (IBM) wrote:
> >> We would eventually use iomap_iop_** function naming by the rest of the
> >> buffered-io iomap code. This patch update function arguments and naming
> >> from iomap_set_range_uptodate() -> iomap_iop_set_range_uptodate().
> >> iop_set_range_uptodate() then becomes an accessor function used by
> >> iomap_iop_** functions.
> >>
> >> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> >> ---
> >>  fs/iomap/buffered-io.c | 111 +++++++++++++++++++++++------------------
> >>  1 file changed, 63 insertions(+), 48 deletions(-)
> >>
> >> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> >> index 6fffda355c45..136f57ccd0be 100644
> >> --- a/fs/iomap/buffered-io.c
> >> +++ b/fs/iomap/buffered-io.c
> >> @@ -24,14 +24,14 @@
> >>  #define IOEND_BATCH_SIZE	4096
> >>
> >>  /*
> >> - * Structure allocated for each folio when block size < folio size
> >> - * to track sub-folio uptodate status and I/O completions.
> >> + * Structure allocated for each folio to track per-block uptodate state
> >> + * and I/O completions.
> >>   */
> >>  struct iomap_page {
> >>  	atomic_t		read_bytes_pending;
> >>  	atomic_t		write_bytes_pending;
> >> -	spinlock_t		uptodate_lock;
> >> -	unsigned long		uptodate[];
> >> +	spinlock_t		state_lock;
> >> +	unsigned long		state[];
> >>  };
> >>
> >>  static inline struct iomap_page *to_iomap_page(struct folio *folio)
> >> @@ -43,6 +43,48 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
> >>
> >>  static struct bio_set iomap_ioend_bioset;
> >>
> >> +static bool iop_test_full_uptodate(struct folio *folio)
> >
> > Same comment as Andreas, I think this works better with 'fully', e.g.
> >
> > iop_test_fully_uptodate()
> >
> > Why you don't pass the iomap_page directly into this function?  Doesn't
> > that eliminate the need for iomap_iop_free to keep folio->private set
> > until the very end?
> >
> > static inline bool
> > iomap_iop_is_fully_uptodate(const struct iomap_page *iop,
> > 			    const struct folio *folio)
> >
> > Same sort of thing for the second function -- we already extracted
> > folio->private and checked it wasn't null, so we don't need to do that
> > again.
> >
> > static inline bool
> > iomap_io_is_block_uptodate(const struct iomap_page *iop,
> > 			   const struct folio *folio,
> > 			   unsigned int block)
> >
> >> +{
> >> +	struct iomap_page *iop = to_iomap_page(folio);
> >> +	struct inode *inode = folio->mapping->host;
> >> +
> >> +	return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
> >> +}
> >> +
> >> +static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
> >> +{
> >> +	struct iomap_page *iop = to_iomap_page(folio);
> >> +
> >> +	return test_bit(block, iop->state);
> >> +}
> >> +
> >> +static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
> >> +				   size_t off, size_t len)
> >> +{
> >> +	struct iomap_page *iop = to_iomap_page(folio);
> >> +	unsigned int first_blk = off >> inode->i_blkbits;
> >> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> >> +	unsigned int nr_blks = last_blk - first_blk + 1;
> >> +	unsigned long flags;
> >> +
> >> +	spin_lock_irqsave(&iop->state_lock, flags);
> >> +	bitmap_set(iop->state, first_blk, nr_blks);
> >> +	if (iop_test_full_uptodate(folio))
> >> +		folio_mark_uptodate(folio);
> >> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> >> +}
> >> +
> >> +static void iomap_iop_set_range_uptodate(struct inode *inode,
> >
> > I don't understand why iomap_set_range_uptodate is now
> > iomap_iop_set_range_uptodate; it doesn't take an iomap_page object as an
> > argument...?
> >
> > I thought I understood that iomap_FOO operates on a folio and a range,
> > whereas iomap_iop_FOO operates on sub-blocks within a folio?  And that
> > you were renaming the iop_* functions to iomap_iop_*?
> >
> > I'm also not sure why iop_set_range_uptodate needs to be passed the
> > struct inode; can't it extract that from folio->mapping->host, like
> > current upstream does?
> >
> 
> So, I do have a confusion in __folio_mark_dirty() function...
> 
> i.e. __folio_mark_dirty checks whether folio->mapping is not NULL.
> That means for marking range of blocks dirty within iop from
> ->dirty_folio(), we can't use folio->mapping->host is it?
> We have to use inode from mapping->host (mapping is passed as a
> parameter in ->dirty_folio).

Ah, yeah.  folio->mapping can become NULL if truncate races with us in
removing the folio from the foliocache.

For regular reads and writes this is a nonissue because those paths all
take i_rwsem and will block truncate.  However, for page_mkwrite, xfs
doesn't take mmap_invalidate_lock until after the vm_fault has been
given a folio to play with.

I think that means we can't rely on folio->mapping to be non-null, and
hence can't dereference folio->mapping->host.  That said, if the folio's
locked and folio->mapping is null, the page has been truncated so we
could just return VM_FAULT_NOPAGE, perhaps?

(willy might know better than I do...)

> I tried looking into the history of this, but I couldn't find any.
> This is also the reason I thought we would need to pass an inode for the
> iop_set_range_dirty() function (because it is also called from
> ->dirty_folio -> iomap_dirty_folio()) and to keep it consistent, I kept
> inode argument for iop_**_uptodate functions as well.

That seems ok to me, now that you've brought that ^^ to my attention.

--D

> Thoughts please?
> 
> <some code snippet>
> ====================
> 
> iomap_dirty_folio -> filemap_dirty_folio-> __folio_mark_dirty()
> 
> void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
> 			     int warn)
> {
> 	unsigned long flags;
> 
> 	xa_lock_irqsave(&mapping->i_pages, flags);
> 	if (folio->mapping) {	/* Race with truncate? */
> 		WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
> 		folio_account_dirtied(folio, mapping);
> 		__xa_set_mark(&mapping->i_pages, folio_index(folio),
> 				PAGECACHE_TAG_DIRTY);
> 	}
> 	xa_unlock_irqrestore(&mapping->i_pages, flags);
> }
> 
> -ritesh
> 
> 
> 
> > Generally I don't understand why this part of the patch is needed at
> > all.  Wasn't the point merely to rename uptodate_* to state_* and
> > introduce the iomap_iop_test_*_uptodate helpers?
> >
> > --D
> >
> >> +		struct folio *folio, size_t off, size_t len)
> >> +{
> >> +	struct iomap_page *iop = to_iomap_page(folio);
> >> +
> >> +	if (iop)
> >> +		iop_set_range_uptodate(inode, folio, off, len);
> >> +	else
> >> +		folio_mark_uptodate(folio);
> >> +}
> >> +
> >>  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
> >>  				struct folio *folio, unsigned int flags)
> >>  {
> >> @@ -58,12 +100,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
> >>  	else
> >>  		gfp = GFP_NOFS | __GFP_NOFAIL;
> >>
> >> -	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
> >> +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
> >>  		      gfp);
> >>  	if (iop) {
> >> -		spin_lock_init(&iop->uptodate_lock);
> >> +		spin_lock_init(&iop->state_lock);
> >>  		if (folio_test_uptodate(folio))
> >> -			bitmap_fill(iop->uptodate, nr_blocks);
> >> +			bitmap_fill(iop->state, nr_blocks);
> >>  		folio_attach_private(folio, iop);
> >>  	}
> >>  	return iop;
> >> @@ -72,14 +114,12 @@ static struct iomap_page *iomap_iop_alloc(struct inode *inode,
> >>  static void iomap_iop_free(struct folio *folio)
> >>  {
> >>  	struct iomap_page *iop = to_iomap_page(folio);
> >> -	struct inode *inode = folio->mapping->host;
> >> -	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> >>
> >>  	if (!iop)
> >>  		return;
> >>  	WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
> >>  	WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
> >> -	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
> >> +	WARN_ON_ONCE(iop_test_full_uptodate(folio) !=
> >>  			folio_test_uptodate(folio));
> >>  	folio_detach_private(folio);
> >>  	kfree(iop);
> >> @@ -111,7 +151,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
> >>
> >>  		/* move forward for each leading block marked uptodate */
> >>  		for (i = first; i <= last; i++) {
> >> -			if (!test_bit(i, iop->uptodate))
> >> +			if (!iop_test_block_uptodate(folio, i))
> >>  				break;
> >>  			*pos += block_size;
> >>  			poff += block_size;
> >> @@ -121,7 +161,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
> >>
> >>  		/* truncate len if we find any trailing uptodate block(s) */
> >>  		for ( ; i <= last; i++) {
> >> -			if (test_bit(i, iop->uptodate)) {
> >> +			if (iop_test_block_uptodate(folio, i)) {
> >>  				plen -= (last - i + 1) * block_size;
> >>  				last = i - 1;
> >>  				break;
> >> @@ -145,30 +185,6 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
> >>  	*lenp = plen;
> >>  }
> >>
> >> -static void iomap_iop_set_range_uptodate(struct folio *folio,
> >> -		struct iomap_page *iop, size_t off, size_t len)
> >> -{
> >> -	struct inode *inode = folio->mapping->host;
> >> -	unsigned first = off >> inode->i_blkbits;
> >> -	unsigned last = (off + len - 1) >> inode->i_blkbits;
> >> -	unsigned long flags;
> >> -
> >> -	spin_lock_irqsave(&iop->uptodate_lock, flags);
> >> -	bitmap_set(iop->uptodate, first, last - first + 1);
> >> -	if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
> >> -		folio_mark_uptodate(folio);
> >> -	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
> >> -}
> >> -
> >> -static void iomap_set_range_uptodate(struct folio *folio,
> >> -		struct iomap_page *iop, size_t off, size_t len)
> >> -{
> >> -	if (iop)
> >> -		iomap_iop_set_range_uptodate(folio, iop, off, len);
> >> -	else
> >> -		folio_mark_uptodate(folio);
> >> -}
> >> -
> >>  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
> >>  		size_t len, int error)
> >>  {
> >> @@ -178,7 +194,8 @@ static void iomap_finish_folio_read(struct folio *folio, size_t offset,
> >>  		folio_clear_uptodate(folio);
> >>  		folio_set_error(folio);
> >>  	} else {
> >> -		iomap_set_range_uptodate(folio, iop, offset, len);
> >> +		iomap_iop_set_range_uptodate(folio->mapping->host, folio,
> >> +					     offset, len);
> >>  	}
> >>
> >>  	if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
> >> @@ -214,7 +231,6 @@ struct iomap_readpage_ctx {
> >>  static int iomap_read_inline_data(const struct iomap_iter *iter,
> >>  		struct folio *folio)
> >>  {
> >> -	struct iomap_page *iop;
> >>  	const struct iomap *iomap = iomap_iter_srcmap(iter);
> >>  	size_t size = i_size_read(iter->inode) - iomap->offset;
> >>  	size_t poff = offset_in_page(iomap->offset);
> >> @@ -232,15 +248,14 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
> >>  	if (WARN_ON_ONCE(size > iomap->length))
> >>  		return -EIO;
> >>  	if (offset > 0)
> >> -		iop = iomap_iop_alloc(iter->inode, folio, iter->flags);
> >> -	else
> >> -		iop = to_iomap_page(folio);
> >> +		iomap_iop_alloc(iter->inode, folio, iter->flags);
> >>
> >>  	addr = kmap_local_folio(folio, offset);
> >>  	memcpy(addr, iomap->inline_data, size);
> >>  	memset(addr + size, 0, PAGE_SIZE - poff - size);
> >>  	kunmap_local(addr);
> >> -	iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
> >> +	iomap_iop_set_range_uptodate(iter->inode, folio, offset,
> >> +				     PAGE_SIZE - poff);
> >>  	return 0;
> >>  }
> >>
> >> @@ -277,7 +292,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
> >>
> >>  	if (iomap_block_needs_zeroing(iter, pos)) {
> >>  		folio_zero_range(folio, poff, plen);
> >> -		iomap_set_range_uptodate(folio, iop, poff, plen);
> >> +		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
> >>  		goto done;
> >>  	}
> >>
> >> @@ -452,7 +467,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
> >>  	last = (from + count - 1) >> inode->i_blkbits;
> >>
> >>  	for (i = first; i <= last; i++)
> >> -		if (!test_bit(i, iop->uptodate))
> >> +		if (!iop_test_block_uptodate(folio, i))
> >>  			return false;
> >>  	return true;
> >>  }
> >> @@ -591,7 +606,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> >>  			if (status)
> >>  				return status;
> >>  		}
> >> -		iomap_set_range_uptodate(folio, iop, poff, plen);
> >> +		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
> >>  	} while ((block_start += plen) < block_end);
> >>
> >>  	return 0;
> >> @@ -698,7 +713,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
> >>  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
> >>  		size_t copied, struct folio *folio)
> >>  {
> >> -	struct iomap_page *iop = to_iomap_page(folio);
> >>  	flush_dcache_folio(folio);
> >>
> >>  	/*
> >> @@ -714,7 +728,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
> >>  	 */
> >>  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
> >>  		return 0;
> >> -	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
> >> +	iomap_iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos),
> >> +				     len);
> >>  	filemap_dirty_folio(inode->i_mapping, folio);
> >>  	return copied;
> >>  }
> >> @@ -1630,7 +1645,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> >>  	 * invalid, grab a new one.
> >>  	 */
> >>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> >> -		if (iop && !test_bit(i, iop->uptodate))
> >> +		if (iop && !iop_test_block_uptodate(folio, i))
> >>  			continue;
> >>
> >>  		error = wpc->ops->map_blocks(wpc, inode, pos);
> >> --
> >> 2.40.1
> >>
Matthew Wilcox June 6, 2023, 4:29 p.m. UTC | #9
On Tue, Jun 06, 2023 at 09:03:17AM -0700, Darrick J. Wong wrote:
> On Tue, Jun 06, 2023 at 05:21:32AM +0530, Ritesh Harjani wrote:
> > So, I do have a confusion in __folio_mark_dirty() function...
> > 
> > i.e. __folio_mark_dirty checks whether folio->mapping is not NULL.
> > That means for marking range of blocks dirty within iop from
> > ->dirty_folio(), we can't use folio->mapping->host is it?
> > We have to use inode from mapping->host (mapping is passed as a
> > parameter in ->dirty_folio).

It probably helps to read the commentary above filemap_dirty_folio().

 * The caller must ensure this doesn't race with truncation.  Most will
 * simply hold the folio lock, but e.g. zap_pte_range() calls with the
 * folio mapped and the pte lock held, which also locks out truncation.

But __folio_mark_dirty() can't rely on that!  Again, see the commentary:

 * This can also be called from mark_buffer_dirty(), which I
 * cannot prove is always protected against truncate.

iomap doesn't do bottom-up dirtying, only top-down.  So it absolutely
can rely on the VFS having taken the appropriate locks.

> Ah, yeah.  folio->mapping can become NULL if truncate races with us in
> removing the folio from the foliocache.
> 
> For regular reads and writes this is a nonissue because those paths all
> take i_rwsem and will block truncate.  However, for page_mkwrite, xfs
> doesn't take mmap_invalidate_lock until after the vm_fault has been
> given a folio to play with.

invalidate_lock isn't needed here.  You take the folio_lock, then you
call folio_mkwrite_check_truncate() to make sure it wasn't truncated
before you took the folio_lock.  Truncation will block on the folio_lock,
so you're good unless you release the folio_lock (which you don't,
you return it to the MM locked).
Ritesh Harjani (IBM) June 7, 2023, 1:08 p.m. UTC | #10
Matthew Wilcox <willy@infradead.org> writes:

> On Tue, Jun 06, 2023 at 09:03:17AM -0700, Darrick J. Wong wrote:
>> On Tue, Jun 06, 2023 at 05:21:32AM +0530, Ritesh Harjani wrote:
>> > So, I do have a confusion in __folio_mark_dirty() function...
>> > 
>> > i.e. __folio_mark_dirty checks whether folio->mapping is not NULL.
>> > That means for marking range of blocks dirty within iop from
>> > ->dirty_folio(), we can't use folio->mapping->host is it?
>> > We have to use inode from mapping->host (mapping is passed as a
>> > parameter in ->dirty_folio).
>
> It probably helps to read the commentary above filemap_dirty_folio().
>
>  * The caller must ensure this doesn't race with truncation.  Most will
>  * simply hold the folio lock, but e.g. zap_pte_range() calls with the
>  * folio mapped and the pte lock held, which also locks out truncation.
>
> But __folio_mark_dirty() can't rely on that!  Again, see the commentary:
>
>  * This can also be called from mark_buffer_dirty(), which I
>  * cannot prove is always protected against truncate.
>
> iomap doesn't do bottom-up dirtying, only top-down.  So it absolutely
> can rely on the VFS having taken the appropriate locks.
>

Right.

>> Ah, yeah.  folio->mapping can become NULL if truncate races with us in
>> removing the folio from the foliocache.
>> 
>> For regular reads and writes this is a nonissue because those paths all
>> take i_rwsem and will block truncate.  However, for page_mkwrite, xfs
>> doesn't take mmap_invalidate_lock until after the vm_fault has been
>> given a folio to play with.
>
> invalidate_lock isn't needed here.  You take the folio_lock, then you
> call folio_mkwrite_check_truncate() to make sure it wasn't truncated
> before you took the folio_lock.  Truncation will block on the folio_lock,
> so you're good unless you release the folio_lock (which you don't,
> you return it to the MM locked).

ohhk. Thanks for explaining this. So most callers hold the folio_lock()
which prevents agains the race from truncation while calling
->dirty_folio(). Some of the callers cannot use folio_lock() so instead
they hold the page table lock which can as well prevent against
truncation.

So I can just go ahead and use folio->mapping->host in
iomap_dirty_folio() function as well. 

Thanks a lot!! This helped. Will drop the inode from the function
argument then and will use folio->mapping->host instead. 

-ritesh
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6fffda355c45..136f57ccd0be 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -24,14 +24,14 @@ 
 #define IOEND_BATCH_SIZE	4096
 
 /*
- * Structure allocated for each folio when block size < folio size
- * to track sub-folio uptodate status and I/O completions.
+ * Structure allocated for each folio to track per-block uptodate state
+ * and I/O completions.
  */
 struct iomap_page {
 	atomic_t		read_bytes_pending;
 	atomic_t		write_bytes_pending;
-	spinlock_t		uptodate_lock;
-	unsigned long		uptodate[];
+	spinlock_t		state_lock;
+	unsigned long		state[];
 };
 
 static inline struct iomap_page *to_iomap_page(struct folio *folio)
@@ -43,6 +43,48 @@  static inline struct iomap_page *to_iomap_page(struct folio *folio)
 
 static struct bio_set iomap_ioend_bioset;
 
+static bool iop_test_full_uptodate(struct folio *folio)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	struct inode *inode = folio->mapping->host;
+
+	return bitmap_full(iop->state, i_blocks_per_folio(inode, folio));
+}
+
+static bool iop_test_block_uptodate(struct folio *folio, unsigned int block)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+
+	return test_bit(block, iop->state);
+}
+
+static void iop_set_range_uptodate(struct inode *inode, struct folio *folio,
+				   size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	unsigned int first_blk = off >> inode->i_blkbits;
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_set(iop->state, first_blk, nr_blks);
+	if (iop_test_full_uptodate(folio))
+		folio_mark_uptodate(folio);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_iop_set_range_uptodate(struct inode *inode,
+		struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+
+	if (iop)
+		iop_set_range_uptodate(inode, folio, off, len);
+	else
+		folio_mark_uptodate(folio);
+}
+
 static struct iomap_page *iomap_iop_alloc(struct inode *inode,
 				struct folio *folio, unsigned int flags)
 {
@@ -58,12 +100,12 @@  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
+	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
 		      gfp);
 	if (iop) {
-		spin_lock_init(&iop->uptodate_lock);
+		spin_lock_init(&iop->state_lock);
 		if (folio_test_uptodate(folio))
-			bitmap_fill(iop->uptodate, nr_blocks);
+			bitmap_fill(iop->state, nr_blocks);
 		folio_attach_private(folio, iop);
 	}
 	return iop;
@@ -72,14 +114,12 @@  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
 static void iomap_iop_free(struct folio *folio)
 {
 	struct iomap_page *iop = to_iomap_page(folio);
-	struct inode *inode = folio->mapping->host;
-	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
 
 	if (!iop)
 		return;
 	WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
 	WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
-	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
+	WARN_ON_ONCE(iop_test_full_uptodate(folio) !=
 			folio_test_uptodate(folio));
 	folio_detach_private(folio);
 	kfree(iop);
@@ -111,7 +151,7 @@  static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
 
 		/* move forward for each leading block marked uptodate */
 		for (i = first; i <= last; i++) {
-			if (!test_bit(i, iop->uptodate))
+			if (!iop_test_block_uptodate(folio, i))
 				break;
 			*pos += block_size;
 			poff += block_size;
@@ -121,7 +161,7 @@  static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
 
 		/* truncate len if we find any trailing uptodate block(s) */
 		for ( ; i <= last; i++) {
-			if (test_bit(i, iop->uptodate)) {
+			if (iop_test_block_uptodate(folio, i)) {
 				plen -= (last - i + 1) * block_size;
 				last = i - 1;
 				break;
@@ -145,30 +185,6 @@  static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
 	*lenp = plen;
 }
 
-static void iomap_iop_set_range_uptodate(struct folio *folio,
-		struct iomap_page *iop, size_t off, size_t len)
-{
-	struct inode *inode = folio->mapping->host;
-	unsigned first = off >> inode->i_blkbits;
-	unsigned last = (off + len - 1) >> inode->i_blkbits;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iop->uptodate_lock, flags);
-	bitmap_set(iop->uptodate, first, last - first + 1);
-	if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
-		folio_mark_uptodate(folio);
-	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
-}
-
-static void iomap_set_range_uptodate(struct folio *folio,
-		struct iomap_page *iop, size_t off, size_t len)
-{
-	if (iop)
-		iomap_iop_set_range_uptodate(folio, iop, off, len);
-	else
-		folio_mark_uptodate(folio);
-}
-
 static void iomap_finish_folio_read(struct folio *folio, size_t offset,
 		size_t len, int error)
 {
@@ -178,7 +194,8 @@  static void iomap_finish_folio_read(struct folio *folio, size_t offset,
 		folio_clear_uptodate(folio);
 		folio_set_error(folio);
 	} else {
-		iomap_set_range_uptodate(folio, iop, offset, len);
+		iomap_iop_set_range_uptodate(folio->mapping->host, folio,
+					     offset, len);
 	}
 
 	if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
@@ -214,7 +231,6 @@  struct iomap_readpage_ctx {
 static int iomap_read_inline_data(const struct iomap_iter *iter,
 		struct folio *folio)
 {
-	struct iomap_page *iop;
 	const struct iomap *iomap = iomap_iter_srcmap(iter);
 	size_t size = i_size_read(iter->inode) - iomap->offset;
 	size_t poff = offset_in_page(iomap->offset);
@@ -232,15 +248,14 @@  static int iomap_read_inline_data(const struct iomap_iter *iter,
 	if (WARN_ON_ONCE(size > iomap->length))
 		return -EIO;
 	if (offset > 0)
-		iop = iomap_iop_alloc(iter->inode, folio, iter->flags);
-	else
-		iop = to_iomap_page(folio);
+		iomap_iop_alloc(iter->inode, folio, iter->flags);
 
 	addr = kmap_local_folio(folio, offset);
 	memcpy(addr, iomap->inline_data, size);
 	memset(addr + size, 0, PAGE_SIZE - poff - size);
 	kunmap_local(addr);
-	iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
+	iomap_iop_set_range_uptodate(iter->inode, folio, offset,
+				     PAGE_SIZE - poff);
 	return 0;
 }
 
@@ -277,7 +292,7 @@  static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 
 	if (iomap_block_needs_zeroing(iter, pos)) {
 		folio_zero_range(folio, poff, plen);
-		iomap_set_range_uptodate(folio, iop, poff, plen);
+		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
 		goto done;
 	}
 
@@ -452,7 +467,7 @@  bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
 	last = (from + count - 1) >> inode->i_blkbits;
 
 	for (i = first; i <= last; i++)
-		if (!test_bit(i, iop->uptodate))
+		if (!iop_test_block_uptodate(folio, i))
 			return false;
 	return true;
 }
@@ -591,7 +606,7 @@  static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 			if (status)
 				return status;
 		}
-		iomap_set_range_uptodate(folio, iop, poff, plen);
+		iomap_iop_set_range_uptodate(iter->inode, folio, poff, plen);
 	} while ((block_start += plen) < block_end);
 
 	return 0;
@@ -698,7 +713,6 @@  static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
 {
-	struct iomap_page *iop = to_iomap_page(folio);
 	flush_dcache_folio(folio);
 
 	/*
@@ -714,7 +728,8 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	 */
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
-	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
+	iomap_iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos),
+				     len);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -1630,7 +1645,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (iop && !test_bit(i, iop->uptodate))
+		if (iop && !iop_test_block_uptodate(folio, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);