Message ID | 20240514152208.26935-1-jth@kernel.org (mailing list archive)
---|---
State | New, archived
Series | zonefs: move super block reading from page to folio
On Tue, May 14, 2024 at 05:22:08PM +0200, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Move reading of the on-disk superblock from page to kmalloc()ed memory.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Looks fine.

Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>

> ---
>  fs/zonefs/super.c | 30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
>
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index faf1eb87895d..ebea18da6759 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -1111,28 +1111,28 @@ static int zonefs_read_super(struct super_block *sb)
>  	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
>  	struct zonefs_super *super;
>  	u32 crc, stored_crc;
> -	struct page *page;
>  	struct bio_vec bio_vec;
>  	struct bio bio;
> +	struct folio *folio;
>  	int ret;
>
> -	page = alloc_page(GFP_KERNEL);
> -	if (!page)
> +	super = kzalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
> +	if (!super)
>  		return -ENOMEM;
>
> +	folio = virt_to_folio(super);
>  	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
>  	bio.bi_iter.bi_sector = 0;
> -	__bio_add_page(&bio, page, PAGE_SIZE, 0);
> +	bio_add_folio_nofail(&bio, folio, ZONEFS_SUPER_SIZE,
> +			     offset_in_folio(folio, super));
>
>  	ret = submit_bio_wait(&bio);
>  	if (ret)
> -		goto free_page;
> -
> -	super = page_address(page);
> +		goto free_super;
>
>  	ret = -EINVAL;
>  	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
> -		goto free_page;
> +		goto free_super;
>
>  	stored_crc = le32_to_cpu(super->s_crc);
>  	super->s_crc = 0;
> @@ -1140,14 +1140,14 @@ static int zonefs_read_super(struct super_block *sb)
>  	if (crc != stored_crc) {
>  		zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
>  			   crc, stored_crc);
> -		goto free_page;
> +		goto free_super;
>  	}
>
>  	sbi->s_features = le64_to_cpu(super->s_features);
>  	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
>  		zonefs_err(sb, "Unknown features set 0x%llx\n",
>  			   sbi->s_features);
> -		goto free_page;
> +		goto free_super;
>  	}
>
>  	if (sbi->s_features & ZONEFS_F_UID) {
> @@ -1155,7 +1155,7 @@ static int zonefs_read_super(struct super_block *sb)
>  			       le32_to_cpu(super->s_uid));
>  		if (!uid_valid(sbi->s_uid)) {
>  			zonefs_err(sb, "Invalid UID feature\n");
> -			goto free_page;
> +			goto free_super;
>  		}
>  	}
>
> @@ -1164,7 +1164,7 @@ static int zonefs_read_super(struct super_block *sb)
>  			       le32_to_cpu(super->s_gid));
>  		if (!gid_valid(sbi->s_gid)) {
>  			zonefs_err(sb, "Invalid GID feature\n");
> -			goto free_page;
> +			goto free_super;
>  		}
>  	}
>
> @@ -1173,14 +1173,14 @@ static int zonefs_read_super(struct super_block *sb)
>
>  	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
>  		zonefs_err(sb, "Reserved area is being used\n");
> -		goto free_page;
> +		goto free_super;
>  	}
>
>  	import_uuid(&sbi->s_uuid, super->s_uuid);
>  	ret = 0;
>
> -free_page:
> -	__free_page(page);
> +free_super:
> +	kfree(super);
>
>  	return ret;
>  }
> --
> 2.35.3
On Tue, May 14, 2024 at 05:22:08PM +0200, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Move reading of the on-disk superblock from page to kmalloc()ed memory.

No, this is wrong.

> +	super = kzalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
> +	if (!super)
>  		return -ENOMEM;
>
> +	folio = virt_to_folio(super);

This will stop working at some point.  It'll return NULL once we get
to the memdesc future (because the memdesc will be a slab, not a folio).

>  	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
>  	bio.bi_iter.bi_sector = 0;
> -	__bio_add_page(&bio, page, PAGE_SIZE, 0);
> +	bio_add_folio_nofail(&bio, folio, ZONEFS_SUPER_SIZE,
> +			     offset_in_folio(folio, super));

It also doesn't solve the problem of trying to read 4KiB from a device
with 16KiB sectors.  We'll have to fail the bio because there isn't
enough memory in the bio to store one block.

I think the right way to handle this is to call read_mapping_folio().
That will allocate a folio in the page cache for you (obeying the
minimum folio size).  Then you can examine the contents.  It should
actually remove code from zonefs.  Don't forget to call folio_put()
when you're done with it (either at unmount or at the end of mount if
you copy what you need elsewhere).
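To make the sector-size objection concrete: the block layer cannot issue a bio carrying less than one logical block, so a hard-coded 4KiB read fails on a 16KiB-sector device regardless of where the buffer comes from. A hypothetical guard (illustrative sketch only, not part of the patch or the thread) would look like:

    /*
     * Illustrative sketch: the read buffer must cover at least one
     * logical block, or the 4KiB bio built above cannot be issued.
     */
    if (ZONEFS_SUPER_SIZE < bdev_logical_block_size(sb->s_bdev)) {
            zonefs_err(sb, "Super block buffer smaller than device block size\n");
            return -EINVAL;
    }

read_mapping_folio(), by contrast, sidesteps the problem entirely because the page cache sizes the folio for the device.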
On 23.05.24 03:42, Matthew Wilcox wrote:
> On Tue, May 14, 2024 at 05:22:08PM +0200, Johannes Thumshirn wrote:
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> Move reading of the on-disk superblock from page to kmalloc()ed memory.
>
> No, this is wrong.
>
>> +	super = kzalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
>> +	if (!super)
>>  		return -ENOMEM;
>>
>> +	folio = virt_to_folio(super);
>
> This will stop working at some point. It'll return NULL once we get
> to the memdesc future (because the memdesc will be a slab, not a folio).
>
>> 	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
>> 	bio.bi_iter.bi_sector = 0;
>> -	__bio_add_page(&bio, page, PAGE_SIZE, 0);
>> +	bio_add_folio_nofail(&bio, folio, ZONEFS_SUPER_SIZE,
>> +			     offset_in_folio(folio, super));
>
> It also doesn't solve the problem of trying to read 4KiB from a device
> with 16KiB sectors. We'll have to fail the bio because there isn't
> enough memory in the bio to store one block.
>
> I think the right way to handle this is to call read_mapping_folio().
> That will allocate a folio in the page cache for you (obeying the
> minimum folio size). Then you can examine the contents. It should
> actually remove code from zonefs. Don't forget to call folio_put()
> when you're done with it (either at unmount or at the end of mount if
> you copy what you need elsewhere).

Hmm, but read_mapping_folio() needs an inode for the address_space. Or
does the block device inode work here?
On Thu, May 23, 2024 at 09:54:00AM +0000, Johannes Thumshirn wrote:
> On 23.05.24 03:42, Matthew Wilcox wrote:
> > I think the right way to handle this is to call read_mapping_folio().
> > That will allocate a folio in the page cache for you (obeying the
> > minimum folio size).  Then you can examine the contents.  It should
> > actually remove code from zonefs.  Don't forget to call folio_put()
> > when you're done with it (either at unmount or at the end of mount if
> > you copy what you need elsewhere).
>
> Hmm, but read_mapping_folio() needs an inode for the address_space. Or does
> the block device inode work here?

Sorry, yes, should have been explicit.  Read it using the bdev's
address_space.
On 23.05.24 13:48, Matthew Wilcox wrote:
> On Thu, May 23, 2024 at 09:54:00AM +0000, Johannes Thumshirn wrote:
>> On 23.05.24 03:42, Matthew Wilcox wrote:
>>> I think the right way to handle this is to call read_mapping_folio().
>>> That will allocate a folio in the page cache for you (obeying the
>>> minimum folio size). Then you can examine the contents. It should
>>> actually remove code from zonefs. Don't forget to call folio_put()
>>> when you're done with it (either at unmount or at the end of mount if
>>> you copy what you need elsewhere).
>>
>> Hmm, but read_mapping_folio() needs an inode for the address_space. Or does
>> the block device inode work here?
>
> Sorry, yes, should have been explicit. Read it using the bdev's
> address_space.

Ah OK, then it's easy.
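For illustration, a minimal sketch of what the read_mapping_folio() route could look like in zonefs_read_super(). This is an assumption-laden sketch, not code from the thread: zonefs_check_super() is a made-up stand-in for the existing magic/CRC/feature validation, and the superblock is assumed to be copied out before the folio reference is dropped.

    static int zonefs_read_super(struct super_block *sb)
    {
            struct zonefs_super *super;
            struct folio *folio;
            int ret;

            /* Read block 0 through the bdev's page cache; the folio
             * allocated there honors the mapping's minimum folio size. */
            folio = read_mapping_folio(sb->s_bdev->bd_mapping, 0, NULL);
            if (IS_ERR(folio))
                    return PTR_ERR(folio);

            super = kmap_local_folio(folio, 0);
            ret = zonefs_check_super(sb, super);    /* hypothetical helper */
            kunmap_local(super);

            /* Drop the page cache reference once done with the contents. */
            folio_put(folio);
            return ret;
    }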
On Thu, May 23, 2024 at 02:41:51AM +0100, Matthew Wilcox wrote:
> On Tue, May 14, 2024 at 05:22:08PM +0200, Johannes Thumshirn wrote:
> > From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> >
> > Move reading of the on-disk superblock from page to kmalloc()ed memory.
>
> No, this is wrong.
>
> > +	super = kzalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
> > +	if (!super)
> >  		return -ENOMEM;
> >
> > +	folio = virt_to_folio(super);
>
> This will stop working at some point.  It'll return NULL once we get
> to the memdesc future (because the memdesc will be a slab, not a folio).

Hmmm, xfs_buf.c plays a similar trick here for sub-page buffers.  I'm
assuming that will get ported to ... whatever the memdesc future holds?

> >  	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
> >  	bio.bi_iter.bi_sector = 0;
> > -	__bio_add_page(&bio, page, PAGE_SIZE, 0);
> > +	bio_add_folio_nofail(&bio, folio, ZONEFS_SUPER_SIZE,
> > +			     offset_in_folio(folio, super));
>
> It also doesn't solve the problem of trying to read 4KiB from a device
> with 16KiB sectors.  We'll have to fail the bio because there isn't
> enough memory in the bio to store one block.
>
> I think the right way to handle this is to call read_mapping_folio().
> That will allocate a folio in the page cache for you (obeying the
> minimum folio size).  Then you can examine the contents.  It should
> actually remove code from zonefs.  Don't forget to call folio_put()
> when you're done with it (either at unmount or at the end of mount if
> you copy what you need elsewhere).

The downside of using bd_mapping is that userspace can scribble all over
the folio contents.  For zonefs that's less of a big deal because it
only reads it once, but for everyone else (e.g. ext4) it's been a huge
problem.  I guess you could always do max(ZONEFS_SUPER_SIZE,
block_size(sb->s_bdev)) if you don't want to use the pagecache.

<more mumbling about buffer caches>

--D
On 5/31/24 10:16 AM, Darrick J. Wong wrote:
> On Thu, May 23, 2024 at 02:41:51AM +0100, Matthew Wilcox wrote:
>> On Tue, May 14, 2024 at 05:22:08PM +0200, Johannes Thumshirn wrote:
>>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>>
>>> Move reading of the on-disk superblock from page to kmalloc()ed memory.
>>
>> No, this is wrong.
>>
>>> +	super = kzalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
>>> +	if (!super)
>>>  		return -ENOMEM;
>>>
>>> +	folio = virt_to_folio(super);
>>
>> This will stop working at some point. It'll return NULL once we get
>> to the memdesc future (because the memdesc will be a slab, not a folio).
>
> Hmmm, xfs_buf.c plays a similar trick here for sub-page buffers. I'm
> assuming that will get ported to ... whatever the memdesc future holds?
>
>>> 	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
>>> 	bio.bi_iter.bi_sector = 0;
>>> -	__bio_add_page(&bio, page, PAGE_SIZE, 0);
>>> +	bio_add_folio_nofail(&bio, folio, ZONEFS_SUPER_SIZE,
>>> +			     offset_in_folio(folio, super));
>>
>> It also doesn't solve the problem of trying to read 4KiB from a device
>> with 16KiB sectors. We'll have to fail the bio because there isn't
>> enough memory in the bio to store one block.
>>
>> I think the right way to handle this is to call read_mapping_folio().
>> That will allocate a folio in the page cache for you (obeying the
>> minimum folio size). Then you can examine the contents. It should
>> actually remove code from zonefs. Don't forget to call folio_put()
>> when you're done with it (either at unmount or at the end of mount if
>> you copy what you need elsewhere).
>
> The downside of using bd_mapping is that userspace can scribble all over
> the folio contents. For zonefs that's less of a big deal because it
> only reads it once, but for everyone else (e.g. ext4) it's been a huge

Yes, and the zonefs super block is read-only, we never update it after
formatting.

> problem. I guess you could always do max(ZONEFS_SUPER_SIZE,
> block_size(sb->s_bdev)) if you don't want to use the pagecache.

Good point. ZONEFS_SUPER_SIZE is 4K and given that I only know of 512e
and 4K zoned block devices, this is not an issue yet. But better safe
than sorry, so doing the max() thing you propose is better. Will patch
that.
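A sketch of that max() sizing on top of the posted patch (illustrative only; sb_len is a hypothetical local, and the rest of the function is assumed unchanged):

    /* Size the buffer to cover at least one device block, per the
     * suggestion above, so the bio is never smaller than one block. */
    size_t sb_len = max_t(size_t, ZONEFS_SUPER_SIZE,
                          block_size(sb->s_bdev));

    super = kzalloc(sb_len, GFP_KERNEL);
    if (!super)
            return -ENOMEM;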
On Fri, May 31, 2024 at 10:28:50AM +0900, Damien Le Moal wrote:
> >> This will stop working at some point.  It'll return NULL once we get
> >> to the memdesc future (because the memdesc will be a slab, not a folio).
> >
> > Hmmm, xfs_buf.c plays a similar trick here for sub-page buffers.  I'm
> > assuming that will get ported to ... whatever the memdesc future holds?

I don't think it does, exactly?  Are you referring to kmem_to_page()?
That will continue to work.  You're not trying to get a folio from a
slab allocation; that will start to fail.

> >> I think the right way to handle this is to call read_mapping_folio().
> >> That will allocate a folio in the page cache for you (obeying the
> >> minimum folio size).  Then you can examine the contents.  It should
> >> actually remove code from zonefs.  Don't forget to call folio_put()
> >> when you're done with it (either at unmount or at the end of mount if
> >> you copy what you need elsewhere).
> >
> > The downside of using bd_mapping is that userspace can scribble all over
> > the folio contents.  For zonefs that's less of a big deal because it
> > only reads it once, but for everyone else (e.g. ext4) it's been a huge
>
> Yes, and zonefs super block is read-only, we never update it after formatting.
>
> > problem.  I guess you could always do max(ZONEFS_SUPER_SIZE,
> > block_size(sb->s_bdev)) if you don't want to use the pagecache.
>
> Good point. ZONEFS_SUPER_SIZE is 4K and given that I only know of 512e and 4K
> zoned block devices, this is not an issue yet. But better safe than sorry, so
> doing the max() thing you propose is better. Will patch that.

I think you should use read_mapping_folio() for now instead of
complicating zonefs.  Once there's a grand new buffer cache, switch to
that, but I don't think you're introducing a significant vulnerability
by using the block device's page cache.
On 6/2/24 02:51, Matthew Wilcox wrote:
> On Fri, May 31, 2024 at 10:28:50AM +0900, Damien Le Moal wrote:
>>>>> This will stop working at some point. It'll return NULL once we get
>>>>> to the memdesc future (because the memdesc will be a slab, not a folio).
>>>>
>>>> Hmmm, xfs_buf.c plays a similar trick here for sub-page buffers. I'm
>>>> assuming that will get ported to ... whatever the memdesc future holds?
>
> I don't think it does, exactly? Are you referring to kmem_to_page()?
> That will continue to work. You're not trying to get a folio from a
> slab allocation; that will start to fail.
>
>>>>> I think the right way to handle this is to call read_mapping_folio().
>>>>> That will allocate a folio in the page cache for you (obeying the
>>>>> minimum folio size). Then you can examine the contents. It should
>>>>> actually remove code from zonefs. Don't forget to call folio_put()
>>>>> when you're done with it (either at unmount or at the end of mount if
>>>>> you copy what you need elsewhere).
>>>>
>>>> The downside of using bd_mapping is that userspace can scribble all over
>>>> the folio contents. For zonefs that's less of a big deal because it
>>>> only reads it once, but for everyone else (e.g. ext4) it's been a huge
>>>
>>> Yes, and zonefs super block is read-only, we never update it after formatting.
>>>
>>>> problem. I guess you could always do max(ZONEFS_SUPER_SIZE,
>>>> block_size(sb->s_bdev)) if you don't want to use the pagecache.
>>>
>>> Good point. ZONEFS_SUPER_SIZE is 4K and given that I only know of 512e and 4K
>>> zoned block devices, this is not an issue yet. But better safe than sorry, so
>>> doing the max() thing you propose is better. Will patch that.
>
> I think you should use read_mapping_folio() for now instead of
> complicating zonefs. Once there's a grand new buffer cache, switch to
> that, but I don't think you're introducing a significant vulnerability
> by using the block device's page cache.

I was not really thinking about vulnerability here, but rather
compatibility with devices having a block size larger than 4K... But
given that these are rare (at best), a fix for a more intelligent
ZONEFS_SUPER_SIZE is not urgent, and not hard at all anyway.
On Sat, Jun 01, 2024 at 06:51:45PM +0100, Matthew Wilcox wrote:
> On Fri, May 31, 2024 at 10:28:50AM +0900, Damien Le Moal wrote:
> > >> This will stop working at some point.  It'll return NULL once we get
> > >> to the memdesc future (because the memdesc will be a slab, not a folio).
> > >
> > > Hmmm, xfs_buf.c plays a similar trick here for sub-page buffers.  I'm
> > > assuming that will get ported to ... whatever the memdesc future holds?
>
> I don't think it does, exactly?  Are you referring to kmem_to_page()?
> That will continue to work.  You're not trying to get a folio from a
> slab allocation; that will start to fail.

The point is that doing block I/O on a slab allocation is heavily
used, and zonefs does this.  If you dislike going through the folio
we can just keep using pages in zonefs for now.

Eventually I'll get around to lifting the logic to greedily build a bio
from arbitrary kernel virtual addresses from various places in XFS into
common code and we can convert to that.

> I think you should use read_mapping_folio() for now instead of
> complicating zonefs.  Once there's a grand new buffer cache, switch to
> that, but I don't think you're introducing a significant vulnerability
> by using the block device's page cache.

Please don't add pointless uses of the bdev page cache for users that
don't need caching.  This just creates more headaches later on to
untangle it again.
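For reference, the page-based pattern being kept, block I/O on a slab allocation via its backing page, amounts to roughly the sketch below. It is illustrative only and assumes a 4KiB kmalloc() buffer, which the slab allocator returns naturally aligned so it sits entirely within one page on 4KiB-page systems:

    /* Sketch: map the kmalloc()ed buffer back to its page for the bio;
     * slab memory lives in the kernel direct map, so virt_to_page() is
     * valid here (unlike virt_to_folio() in the memdesc future). */
    struct page *page = virt_to_page(super);
    unsigned int off = offset_in_page(super);   /* 0 for a 4KiB kmalloc */

    bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
    bio.bi_iter.bi_sector = 0;
    __bio_add_page(&bio, page, ZONEFS_SUPER_SIZE, off);
    ret = submit_bio_wait(&bio);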
On 6/4/24 14:33, Christoph Hellwig wrote:
> On Sat, Jun 01, 2024 at 06:51:45PM +0100, Matthew Wilcox wrote:
>> On Fri, May 31, 2024 at 10:28:50AM +0900, Damien Le Moal wrote:
>>>>> This will stop working at some point. It'll return NULL once we get
>>>>> to the memdesc future (because the memdesc will be a slab, not a folio).
>>>>
>>>> Hmmm, xfs_buf.c plays a similar trick here for sub-page buffers. I'm
>>>> assuming that will get ported to ... whatever the memdesc future holds?
>>
>> I don't think it does, exactly? Are you referring to kmem_to_page()?
>> That will continue to work. You're not trying to get a folio from a
>> slab allocation; that will start to fail.
>
> The point is that doing block I/O on a slab allocation is heavily
> used, and zonefs does this. If you dislike going through the folio
> we can just keep using pages in zonefs for now.
>
> Eventually I'll get around to lifting the logic to greedily build a bio
> from arbitrary kernel virtual addresses from various places in XFS into
> common code and we can convert to that.
>
>> I think you should use read_mapping_folio() for now instead of
>> complicating zonefs. Once there's a grand new buffer cache, switch to
>> that, but I don't think you're introducing a significant vulnerability
>> by using the block device's page cache.
>
> Please don't add pointless uses of the bdev page cache for users that
> don't need caching. This just creates more headaches later on to
> untangle it again.

OK. Will drop this patch then.
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index faf1eb87895d..ebea18da6759 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1111,28 +1111,28 @@ static int zonefs_read_super(struct super_block *sb)
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	struct zonefs_super *super;
 	u32 crc, stored_crc;
-	struct page *page;
 	struct bio_vec bio_vec;
 	struct bio bio;
+	struct folio *folio;
 	int ret;

-	page = alloc_page(GFP_KERNEL);
-	if (!page)
+	super = kzalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
+	if (!super)
 		return -ENOMEM;

+	folio = virt_to_folio(super);
 	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
 	bio.bi_iter.bi_sector = 0;
-	__bio_add_page(&bio, page, PAGE_SIZE, 0);
+	bio_add_folio_nofail(&bio, folio, ZONEFS_SUPER_SIZE,
+			     offset_in_folio(folio, super));

 	ret = submit_bio_wait(&bio);
 	if (ret)
-		goto free_page;
-
-	super = page_address(page);
+		goto free_super;

 	ret = -EINVAL;
 	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
-		goto free_page;
+		goto free_super;

 	stored_crc = le32_to_cpu(super->s_crc);
 	super->s_crc = 0;
@@ -1140,14 +1140,14 @@ static int zonefs_read_super(struct super_block *sb)
 	if (crc != stored_crc) {
 		zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
 			   crc, stored_crc);
-		goto free_page;
+		goto free_super;
 	}

 	sbi->s_features = le64_to_cpu(super->s_features);
 	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
 		zonefs_err(sb, "Unknown features set 0x%llx\n",
 			   sbi->s_features);
-		goto free_page;
+		goto free_super;
 	}

 	if (sbi->s_features & ZONEFS_F_UID) {
@@ -1155,7 +1155,7 @@ static int zonefs_read_super(struct super_block *sb)
 			       le32_to_cpu(super->s_uid));
 		if (!uid_valid(sbi->s_uid)) {
 			zonefs_err(sb, "Invalid UID feature\n");
-			goto free_page;
+			goto free_super;
 		}
 	}

@@ -1164,7 +1164,7 @@ static int zonefs_read_super(struct super_block *sb)
 			       le32_to_cpu(super->s_gid));
 		if (!gid_valid(sbi->s_gid)) {
 			zonefs_err(sb, "Invalid GID feature\n");
-			goto free_page;
+			goto free_super;
 		}
 	}

@@ -1173,14 +1173,14 @@ static int zonefs_read_super(struct super_block *sb)

 	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
 		zonefs_err(sb, "Reserved area is being used\n");
-		goto free_page;
+		goto free_super;
 	}

 	import_uuid(&sbi->s_uuid, super->s_uuid);
 	ret = 0;

-free_page:
-	__free_page(page);
+free_super:
+	kfree(super);

 	return ret;
 }