Message ID | 20230614114637.89759-5-hare@suse.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | RFC: high-order folio support for I/O | expand |
On Wed, Jun 14, 2023 at 01:46:34PM +0200, Hannes Reinecke wrote: > @@ -43,9 +43,11 @@ struct brd_device { > */ > struct xarray brd_folios; > u64 brd_nr_folios; > + unsigned int brd_sector_shift; > + unsigned int brd_sector_size; > }; > > -#define BRD_SECTOR_SHIFT(b) (PAGE_SHIFT - SECTOR_SHIFT) > +#define BRD_SECTOR_SHIFT(b) ((b)->brd_sector_shift - SECTOR_SHIFT) > > static pgoff_t brd_sector_index(struct brd_device *brd, sector_t sector) > { > @@ -85,7 +87,7 @@ static int brd_insert_folio(struct brd_device *brd, sector_t sector, gfp_t gfp) > { > pgoff_t idx; > struct folio *folio, *cur; > - unsigned int rd_sector_order = get_order(PAGE_SIZE); > + unsigned int rd_sector_order = get_order(brd->brd_sector_size); Surely max(0, brd->brd_sector_shift - PAGE_SHIFT) ? > @@ -346,6 +353,25 @@ static int brd_alloc(int i) > return -ENOMEM; > brd->brd_number = i; > list_add_tail(&brd->brd_list, &brd_devices); > + brd->brd_sector_shift = ilog2(rd_blksize); > + if ((1ULL << brd->brd_sector_shift) != rd_blksize) { > + pr_err("rd_blksize %d is not supported\n", rd_blksize); Are you trying to require power-of-two here? We have is_power_of_2() for that purpose.
On 6/14/23 14:55, Matthew Wilcox wrote: > On Wed, Jun 14, 2023 at 01:46:34PM +0200, Hannes Reinecke wrote: >> @@ -43,9 +43,11 @@ struct brd_device { >> */ >> struct xarray brd_folios; >> u64 brd_nr_folios; >> + unsigned int brd_sector_shift; >> + unsigned int brd_sector_size; >> }; >> >> -#define BRD_SECTOR_SHIFT(b) (PAGE_SHIFT - SECTOR_SHIFT) >> +#define BRD_SECTOR_SHIFT(b) ((b)->brd_sector_shift - SECTOR_SHIFT) >> >> static pgoff_t brd_sector_index(struct brd_device *brd, sector_t sector) >> { >> @@ -85,7 +87,7 @@ static int brd_insert_folio(struct brd_device *brd, sector_t sector, gfp_t gfp) >> { >> pgoff_t idx; >> struct folio *folio, *cur; >> - unsigned int rd_sector_order = get_order(PAGE_SIZE); >> + unsigned int rd_sector_order = get_order(brd->brd_sector_size); > > Surely max(0, brd->brd_sector_shift - PAGE_SHIFT) ? > Errm. Possibly. >> @@ -346,6 +353,25 @@ static int brd_alloc(int i) >> return -ENOMEM; >> brd->brd_number = i; >> list_add_tail(&brd->brd_list, &brd_devices); >> + brd->brd_sector_shift = ilog2(rd_blksize); >> + if ((1ULL << brd->brd_sector_shift) != rd_blksize) { >> + pr_err("rd_blksize %d is not supported\n", rd_blksize); > > Are you trying to require power-of-two here? We have is_power_of_2() > for that purpose. > Ah. So let's use that, then :-) Cheers, Hannes
On Wed, Jun 14, 2023 at 01:46:34PM +0200, Hannes Reinecke wrote: > @@ -310,6 +312,10 @@ static int max_part = 1; > module_param(max_part, int, 0444); > MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); > > +static unsigned int rd_blksize = PAGE_SIZE; > +module_param(rd_blksize, uint, 0444); > +MODULE_PARM_DESC(rd_blksize, "Blocksize of each RAM disk in bytes."); This needs CONFIG_BLK_DEV_RAM_BLOCK_SIZE to set the default size for those of us who don't use modular kernels.... Cheers, Dave.
On Thu, Jun 15, 2023 at 12:17:12PM +1000, Dave Chinner wrote: > On Wed, Jun 14, 2023 at 01:46:34PM +0200, Hannes Reinecke wrote: > > @@ -310,6 +312,10 @@ static int max_part = 1; > > module_param(max_part, int, 0444); > > MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); > > > > +static unsigned int rd_blksize = PAGE_SIZE; > > +module_param(rd_blksize, uint, 0444); > > +MODULE_PARM_DESC(rd_blksize, "Blocksize of each RAM disk in bytes."); > > This needs CONFIG_BLK_DEV_RAM_BLOCK_SIZE to set the default size > for those of us who don't use modular kernels.... You can set a module parameter on the command line for built-in code like brd.rd_blksize=8192 While we're at it, why that weird rd_ prefix for the parameter?
On 6/15/23 04:17, Dave Chinner wrote: > On Wed, Jun 14, 2023 at 01:46:34PM +0200, Hannes Reinecke wrote: >> @@ -310,6 +312,10 @@ static int max_part = 1; >> module_param(max_part, int, 0444); >> MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); >> >> +static unsigned int rd_blksize = PAGE_SIZE; >> +module_param(rd_blksize, uint, 0444); >> +MODULE_PARM_DESC(rd_blksize, "Blocksize of each RAM disk in bytes."); > > This needs CONFIG_BLK_DEV_RAM_BLOCK_SIZE to set the default size > for those of us who don't use modular kernels.... > Ok, will do. Cheers, Hannes
On 6/15/23 07:55, Christoph Hellwig wrote: > On Thu, Jun 15, 2023 at 12:17:12PM +1000, Dave Chinner wrote: >> On Wed, Jun 14, 2023 at 01:46:34PM +0200, Hannes Reinecke wrote: >>> @@ -310,6 +312,10 @@ static int max_part = 1; >>> module_param(max_part, int, 0444); >>> MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); >>> >>> +static unsigned int rd_blksize = PAGE_SIZE; >>> +module_param(rd_blksize, uint, 0444); >>> +MODULE_PARM_DESC(rd_blksize, "Blocksize of each RAM disk in bytes."); >> >> This needs CONFIG_BLK_DEV_RAM_BLOCK_SIZE to set the default size >> for those of us who don't use modular kernels.... > > You can set a module parameter on the command line for built-in code > like brd.rd_blksize=8192 > > While we're at it, why that weird rd_ prefix for the parameter? > Because that's what's used for all the existing parameters, too. We could remove it, but then we would either have inconsistent naming (some parameters with 'rd_', others without), or break existing setups. Cheers, Hannes
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 71d3d8af8b0d..2ebb5532a204 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -30,7 +30,7 @@ /* * Each block ramdisk device has a xarray of folios that stores the folios * containing the block device's contents. A brd folio's ->index is its offset - * in PAGE_SIZE units. This is similar to, but in no way connected with, + * in brd_sector_size units. This is similar to, but in no way connected with, * the kernel's pagecache or buffer cache (which sit above our block device). */ struct brd_device { @@ -43,9 +43,11 @@ struct brd_device { */ struct xarray brd_folios; u64 brd_nr_folios; + unsigned int brd_sector_shift; + unsigned int brd_sector_size; }; -#define BRD_SECTOR_SHIFT(b) (PAGE_SHIFT - SECTOR_SHIFT) +#define BRD_SECTOR_SHIFT(b) ((b)->brd_sector_shift - SECTOR_SHIFT) static pgoff_t brd_sector_index(struct brd_device *brd, sector_t sector) { @@ -85,7 +87,7 @@ static int brd_insert_folio(struct brd_device *brd, sector_t sector, gfp_t gfp) { pgoff_t idx; struct folio *folio, *cur; - unsigned int rd_sector_order = get_order(PAGE_SIZE); + unsigned int rd_sector_order = get_order(brd->brd_sector_size); int ret = 0; folio = brd_lookup_folio(brd, sector); @@ -140,7 +142,7 @@ static void brd_free_folios(struct brd_device *brd) static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n, gfp_t gfp) { - unsigned int rd_sector_size = PAGE_SIZE; + unsigned int rd_sector_size = brd->brd_sector_size; unsigned int offset = brd_sector_offset(brd, sector); size_t copy; int ret; @@ -164,7 +166,7 @@ static void copy_to_brd(struct brd_device *brd, const void *src, { struct folio *folio; void *dst; - unsigned int rd_sector_size = PAGE_SIZE; + unsigned int rd_sector_size = brd->brd_sector_size; unsigned int offset = brd_sector_offset(brd, sector); size_t copy; @@ -197,7 +199,7 @@ static void copy_from_brd(void *dst, struct brd_device *brd, { struct folio *folio; void *src; - unsigned int 
rd_sector_size = PAGE_SIZE; + unsigned int rd_sector_size = brd->brd_sector_size; unsigned int offset = brd_sector_offset(brd, sector); size_t copy; @@ -310,6 +312,10 @@ static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); +static unsigned int rd_blksize = PAGE_SIZE; +module_param(rd_blksize, uint, 0444); +MODULE_PARM_DESC(rd_blksize, "Blocksize of each RAM disk in bytes."); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -336,6 +342,7 @@ static int brd_alloc(int i) struct brd_device *brd; struct gendisk *disk; char buf[DISK_NAME_LEN]; + unsigned int rd_max_sectors; int err = -ENOMEM; list_for_each_entry(brd, &brd_devices, brd_list) @@ -346,6 +353,25 @@ static int brd_alloc(int i) return -ENOMEM; brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); + brd->brd_sector_shift = ilog2(rd_blksize); + if ((1ULL << brd->brd_sector_shift) != rd_blksize) { + pr_err("rd_blksize %d is not supported\n", rd_blksize); + err = -EINVAL; + goto out_free_dev; + } + if (rd_blksize < SECTOR_SIZE) { + pr_err("rd_blksize must be at least 512 bytes\n"); + err = -EINVAL; + goto out_free_dev; + } + /* We can't allocate more than MAX_ORDER pages */ + rd_max_sectors = (1ULL << MAX_ORDER) << BRD_SECTOR_SHIFT(brd); + if (rd_blksize > rd_max_sectors) { + pr_err("rd_blocksize too large\n"); + err = -EINVAL; + goto out_free_dev; + } + brd->brd_sector_size = rd_blksize; xa_init(&brd->brd_folios); @@ -365,15 +391,9 @@ static int brd_alloc(int i) disk->private_data = brd; strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - - /* - * This is so fdisk will align partitions on 4k, because of - * direct_access API needing 4k alignment, returning a PFN - * (This is only a problem on very small devices <= 4M, - * otherwise fdisk will align on 1M. 
Regardless this call - * is harmless) - */ - blk_queue_physical_block_size(disk->queue, PAGE_SIZE); + + blk_queue_physical_block_size(disk->queue, rd_blksize); + blk_queue_max_hw_sectors(disk->queue, 1ULL << (MAX_ORDER + PAGE_SECTORS_SHIFT)); /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
Add a module option 'rd_blksize' to allow the user to change the sector size of the RAM disks. Signed-off-by: Hannes Reinecke <hare@suse.de> --- drivers/block/brd.c | 50 +++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 15 deletions(-)