diff mbox series

[RFC] fs/hugetlb: Fix UBSAN warning reported on hugetlb

Message ID 20220908072659.259324-1-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series [RFC] fs/hugetlb: Fix UBSAN warning reported on hugetlb | expand

Commit Message

Aneesh Kumar K.V Sept. 8, 2022, 7:26 a.m. UTC
Powerpc architecture supports 16GB hugetlb pages with hash translation. For 4K
page size, this is implemented as a hugepage directory entry at the PGD level
and for 64K it is implemented as a huge page pte at the PUD level

Hugetlbfs sets up file system blocksize same as page size and this patch
switches blocks size usage with 16GB hugetlb to use size_t type.

We only change generic code and hugetlbfs related usage of i_blocksize(). Other
fs specific usage is left unchanged in this patch. A large part of this change
is not relevant to hugetlb, but it is changed to make sure we track block size
using size_t in generic code.

Only functionality w.r.t getattr is observed to be impacted by this change.

The below test shows the user-visible change.

 struct stat a;
 stat("/mnt/a", &a);
 printf("st_blksize = %ld\n", a.st_blksize);

Without patch
 # ./a.out  /mnt/a
 st_blksize = 0
 #

With patch
 # ./a.out /mnt/a
 st_blksize = 17179869184
 #

Statx still has the problem

 # stat /mnt/a
   File: /mnt/a
   Size: 0               Blocks: 0          IO Block: 512    regular empty file
 Device: 2eh/46d Inode: 74584       Links: 1
 Access: (0644/-rw-r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
 Context: unconfined_u:object_r:hugetlbfs_t:s0
 Access: 2022-09-07 11:42:14.620239084 -0500
 Modify: 2022-09-07 11:42:14.620239084 -0500
 Change: 2022-09-07 11:42:14.620239084 -0500
  Birth: -

because it uses __u32 stx_blksize in uapi.

struct statx {
	/* 0x00 */
	__u32	stx_mask;	/* What results were written [uncond] */
	__u32	stx_blksize;	/* Preferred general I/O size [uncond] */

Fixing statx requires a syscall change where we add STATX_64BLOCKSIZE.

The change also fixes the below report warning.

 UBSAN: shift-out-of-bounds in ./include/linux/fs.h:709:12
 shift exponent 34 is too large for 32-bit type 'int'
 CPU: 67 PID: 1632 Comm: bash Not tainted 6.0.0-rc2-00327-gee88a56e8517-dirty #1
 Call Trace:
 [c000000021517990] [c000000000cb21e4] dump_stack_lvl+0x98/0xe0 (unreliable)
 [c0000000215179d0] [c000000000cacf60] ubsan_epilogue+0x18/0x70
 [c000000021517a30] [c000000000cac44c] __ubsan_handle_shift_out_of_bounds+0x1bc/0x390
 [c000000021517b30] [c00000000067e5b8] generic_fillattr+0x1b8/0x1d0
 [c000000021517b70] [c00000000067e6ec] vfs_getattr_nosec+0x11c/0x140
 [c000000021517bb0] [c00000000067e888] vfs_statx+0xd8/0x1d0
 [c000000021517c30] [c00000000067f658] vfs_fstatat+0x88/0xd0
 [c000000021517c80] [c00000000067f6e0] __do_sys_newstat+0x40/0x90
 [c000000021517d50] [c00000000003cde0] system_call_exception+0x250/0x600
 [c000000021517e10] [c00000000000c3bc] system_call_common+0xec/0x250

Cc: David Howells <dhowells@redhat.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 fs/buffer.c            | 6 +++---
 fs/dax.c               | 2 +-
 fs/iomap/buffered-io.c | 6 +++---
 fs/iomap/direct-io.c   | 2 +-
 include/linux/fs.h     | 4 ++--
 include/linux/stat.h   | 2 +-
 mm/truncate.c          | 2 +-
 7 files changed, 12 insertions(+), 12 deletions(-)

Comments

Matthew Wilcox (Oracle) Sept. 8, 2022, 4:53 p.m. UTC | #1
On Thu, Sep 08, 2022 at 12:56:59PM +0530, Aneesh Kumar K.V wrote:
> +++ b/fs/dax.c
> @@ -1304,7 +1304,7 @@ EXPORT_SYMBOL_GPL(dax_zero_range);
>  int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
>  		const struct iomap_ops *ops)
>  {
> -	unsigned int blocksize = i_blocksize(inode);
> +	size_t blocksize = i_blocksize(inode);
>  	unsigned int off = pos & (blocksize - 1);

If blocksize is larger than 4GB, then off also needs to be size_t.

> +++ b/fs/iomap/buffered-io.c
> @@ -955,7 +955,7 @@ int
>  iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
>  		const struct iomap_ops *ops)
>  {
> -	unsigned int blocksize = i_blocksize(inode);
> +	size_t blocksize = i_blocksize(inode);
>  	unsigned int off = pos & (blocksize - 1);

Ditto.

(maybe there are others; I didn't check closely)
Aneesh Kumar K.V Sept. 8, 2022, 4:59 p.m. UTC | #2
On 9/8/22 10:23 PM, Matthew Wilcox wrote:
> On Thu, Sep 08, 2022 at 12:56:59PM +0530, Aneesh Kumar K.V wrote:
>> +++ b/fs/dax.c
>> @@ -1304,7 +1304,7 @@ EXPORT_SYMBOL_GPL(dax_zero_range);
>>  int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
>>  		const struct iomap_ops *ops)
>>  {
>> -	unsigned int blocksize = i_blocksize(inode);
>> +	size_t blocksize = i_blocksize(inode);
>>  	unsigned int off = pos & (blocksize - 1);
> 
> If blocksize is larger than 4GB, then off also needs to be size_t.
> 
>> +++ b/fs/iomap/buffered-io.c
>> @@ -955,7 +955,7 @@ int
>>  iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
>>  		const struct iomap_ops *ops)
>>  {
>> -	unsigned int blocksize = i_blocksize(inode);
>> +	size_t blocksize = i_blocksize(inode);
>>  	unsigned int off = pos & (blocksize - 1);
> 
> Ditto.
> 
> (maybe there are others; I didn't check closely)

Thanks. will check those. 

Any feedback on statx? Should we really fix that?

I am still not clear why we chose to set blocksize = pagesize for hugetlbfs.
Was that done to enable application find the hugetlb pagesize via stat()? 

-aneesh
Aristeu Rozanski Oct. 26, 2022, 2:50 p.m. UTC | #3
On Thu, Sep 08, 2022 at 10:29:59PM +0530, Aneesh Kumar K V wrote:
> On 9/8/22 10:23 PM, Matthew Wilcox wrote:
> > On Thu, Sep 08, 2022 at 12:56:59PM +0530, Aneesh Kumar K.V wrote:
> >> +++ b/fs/dax.c
> >> @@ -1304,7 +1304,7 @@ EXPORT_SYMBOL_GPL(dax_zero_range);
> >>  int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> >>  		const struct iomap_ops *ops)
> >>  {
> >> -	unsigned int blocksize = i_blocksize(inode);
> >> +	size_t blocksize = i_blocksize(inode);
> >>  	unsigned int off = pos & (blocksize - 1);
> > 
> > If blocksize is larger than 4GB, then off also needs to be size_t.
> > 
> >> +++ b/fs/iomap/buffered-io.c
> >> @@ -955,7 +955,7 @@ int
> >>  iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> >>  		const struct iomap_ops *ops)
> >>  {
> >> -	unsigned int blocksize = i_blocksize(inode);
> >> +	size_t blocksize = i_blocksize(inode);
> >>  	unsigned int off = pos & (blocksize - 1);
> > 
> > Ditto.
> > 
> > (maybe there are others; I didn't check closely)
> 
> Thanks. will check those. 
> 
> Any feedback on statx? Should we really fix that?
> 
> I am still not clear why we chose to set blocksize = pagesize for hugetlbfs.
> Was that done to enable application find the hugetlb pagesize via stat()? 

I'd like to know that as well. It'd be easier to just limit the hugetlbfs max
blocksize to 4GB. It's very unlikely anything else will use such large
blocksizes and having to introduce new user interfaces for it doesn't sound
right.
Mike Kravetz Oct. 26, 2022, 6:11 p.m. UTC | #4
On 10/26/22 10:50, Aristeu Rozanski wrote:
> On Thu, Sep 08, 2022 at 10:29:59PM +0530, Aneesh Kumar K V wrote:
> > On 9/8/22 10:23 PM, Matthew Wilcox wrote:
> > > On Thu, Sep 08, 2022 at 12:56:59PM +0530, Aneesh Kumar K.V wrote:
> > >> +++ b/fs/dax.c
> > >> @@ -1304,7 +1304,7 @@ EXPORT_SYMBOL_GPL(dax_zero_range);
> > >>  int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> > >>  		const struct iomap_ops *ops)
> > >>  {
> > >> -	unsigned int blocksize = i_blocksize(inode);
> > >> +	size_t blocksize = i_blocksize(inode);
> > >>  	unsigned int off = pos & (blocksize - 1);
> > > 
> > > If blocksize is larger than 4GB, then off also needs to be size_t.
> > > 
> > >> +++ b/fs/iomap/buffered-io.c
> > >> @@ -955,7 +955,7 @@ int
> > >>  iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> > >>  		const struct iomap_ops *ops)
> > >>  {
> > >> -	unsigned int blocksize = i_blocksize(inode);
> > >> +	size_t blocksize = i_blocksize(inode);
> > >>  	unsigned int off = pos & (blocksize - 1);
> > > 
> > > Ditto.
> > > 
> > > (maybe there are others; I didn't check closely)
> > 
> > Thanks. will check those. 
> > 
> > Any feedback on statx? Should we really fix that?
> > 
> > I am still not clear why we chose to set blocksize = pagesize for hugetlbfs.
> > Was that done to enable application find the hugetlb pagesize via stat()? 
> 
> I'd like to know that as well. It'd be easier to just limit the hugetlbfs max
> blocksize to 4GB. It's very unlikely anything else will use such large
> blocksizes and having to introduce new user interfaces for it doesn't sound
> right.

I was not around hugetlbfs when the decision was made to set 'blocksize =
pagesize'.  However, I must say that it does seem to make sense as you
can only add or remove entire hugetlb pages from a hugetlbfs file.  So,
the hugetlb page size does seem to correspond to the meaning of filesystem
blocksize.

Does any application code make use of this?  I can not make a guess.
diff mbox series

Patch

diff --git a/fs/buffer.c b/fs/buffer.c
index 55e762a58eb6..15def791325e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2376,7 +2376,7 @@  static int cont_expand_zero(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	const struct address_space_operations *aops = mapping->a_ops;
-	unsigned int blocksize = i_blocksize(inode);
+	size_t blocksize = i_blocksize(inode);
 	struct page *page;
 	void *fsdata;
 	pgoff_t index, curidx;
@@ -2454,7 +2454,7 @@  int cont_write_begin(struct file *file, struct address_space *mapping,
 			get_block_t *get_block, loff_t *bytes)
 {
 	struct inode *inode = mapping->host;
-	unsigned int blocksize = i_blocksize(inode);
+	size_t blocksize = i_blocksize(inode);
 	unsigned int zerofrom;
 	int err;
 
@@ -2542,7 +2542,7 @@  int block_truncate_page(struct address_space *mapping,
 {
 	pgoff_t index = from >> PAGE_SHIFT;
 	unsigned offset = from & (PAGE_SIZE-1);
-	unsigned blocksize;
+	size_t blocksize;
 	sector_t iblock;
 	unsigned length, pos;
 	struct inode *inode = mapping->host;
diff --git a/fs/dax.c b/fs/dax.c
index c440dcef4b1b..66673ef56695 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1304,7 +1304,7 @@  EXPORT_SYMBOL_GPL(dax_zero_range);
 int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		const struct iomap_ops *ops)
 {
-	unsigned int blocksize = i_blocksize(inode);
+	size_t blocksize = i_blocksize(inode);
 	unsigned int off = pos & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ca5c62901541..4b67018c6e71 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -955,7 +955,7 @@  int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		const struct iomap_ops *ops)
 {
-	unsigned int blocksize = i_blocksize(inode);
+	size_t blocksize = i_blocksize(inode);
 	unsigned int off = pos & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
@@ -1297,7 +1297,7 @@  iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
 		struct writeback_control *wbc, struct list_head *iolist)
 {
 	sector_t sector = iomap_sector(&wpc->iomap, pos);
-	unsigned len = i_blocksize(inode);
+	size_t len = i_blocksize(inode);
 	size_t poff = offset_in_folio(folio, pos);
 
 	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
@@ -1340,7 +1340,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 {
 	struct iomap_page *iop = iomap_page_create(inode, folio, 0);
 	struct iomap_ioend *ioend, *next;
-	unsigned len = i_blocksize(inode);
+	size_t len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
 	u64 pos = folio_pos(folio);
 	int error = 0, count = 0, i;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4eb559a16c9e..d17d9e11cd35 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -241,7 +241,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
-	unsigned int fs_block_size = i_blocksize(inode), pad;
+	size_t fs_block_size = i_blocksize(inode), pad;
 	loff_t length = iomap_length(iter);
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..7fedf9dbcac3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -704,9 +704,9 @@  struct inode {
 
 struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);
 
-static inline unsigned int i_blocksize(const struct inode *node)
+static inline size_t i_blocksize(const struct inode *node)
 {
-	return (1 << node->i_blkbits);
+	return (1UL << node->i_blkbits);
 }
 
 static inline int inode_unhashed(struct inode *inode)
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 7df06931f25d..f362a8d1af0c 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -23,7 +23,7 @@  struct kstat {
 	u32		result_mask;	/* What fields the user got */
 	umode_t		mode;
 	unsigned int	nlink;
-	uint32_t	blksize;	/* Preferred I/O size */
+	size_t		blksize;	/* Preferred I/O size */
 	u64		attributes;
 	u64		attributes_mask;
 #define KSTAT_ATTR_FS_IOC_FLAGS				\
diff --git a/mm/truncate.c b/mm/truncate.c
index 0b0708bf935f..9d4b298d4f83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -800,7 +800,7 @@  EXPORT_SYMBOL(truncate_setsize);
  */
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
 {
-	int bsize = i_blocksize(inode);
+	size_t bsize = i_blocksize(inode);
 	loff_t rounded_from;
 	struct page *page;
 	pgoff_t index;