diff mbox series

[V5,2/3] sd_zbc: Fix report zones buffer allocation

Message ID 20190627092944.20957-3-damien.lemoal@wdc.com (mailing list archive)
State New, archived
Headers show
Series Fix zone revalidation memory allocation failures | expand

Commit Message

Damien Le Moal June 27, 2019, 9:29 a.m. UTC
During disk scan and revalidation done with sd_revalidate(), the zones
of a zoned disk are checked using the helper function
blk_revalidate_disk_zones() if a configuration change is detected
(change in the number of zones or zone size). The function
blk_revalidate_disk_zones() issues report_zones calls that are very
large, that is, to obtain zone information for all zones of the disk
with a single command. The size of the report zones command buffer
necessary for such large request generally is lower than the disk
max_hw_sectors and KMALLOC_MAX_SIZE (4MB) and succeeds on boot (no
memory fragmentation), but often fail at run time (e.g. hot-plug
event). This causes the disk revalidation to fail and the disk
capacity to be changed to 0.

This problem can be avoided by using vmalloc() instead of kmalloc() for
the buffer allocation. To limit the amount of memory to be allocated,
this patch also introduces the arbitrary SD_ZBC_REPORT_MAX_ZONES
maximum number of zones to report with a single report zones command.
This limit may be lowered further to satisfy the disk max_hw_sectors
limit. Finally, to ensure that the vmalloc-ed buffer can always be
mapped in a request, the buffer size is further limited to at most
queue_max_segments() pages, allowing successful mapping of the buffer
even in the worst case scenario where none of the buffer pages are
contiguous.

Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 drivers/scsi/sd_zbc.c | 83 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 21 deletions(-)

Comments

Christoph Hellwig June 27, 2019, 2:09 p.m. UTC | #1
> @@ -79,6 +80,7 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
>  	put_unaligned_be32(buflen, &cmd[10]);
>  	if (partial)
>  		cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
> +
>  	memset(buf, 0, buflen);
>  
>  	result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,

Spurious whitespace change.

> +static void *sd_zbc_alloc_report_buffer(struct request_queue *q,
> +					unsigned int nr_zones, size_t *buflen,
> +					gfp_t gfp_mask)
> +{
> +	size_t bufsize;
> +	void *buf;
> +
> +	/*
> +	 * Report zone buffer size should be at most 64B times the number of
> +	 * zones requested plus the 64B reply header, but should be at least
> +	 * SECTOR_SIZE for ATA devices.
> +	 * Make sure that this size does not exceed the hardware capabilities.
> +	 * Furthermore, since the report zone command cannot be split, make
> +	 * sure that the allocated buffer can always be mapped by limiting the
> +	 * number of pages allocated to the HBA max segments limit.
> +	 */
> +	nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
> +	bufsize = roundup((nr_zones + 1) * 64, 512);
> +	bufsize = min_t(size_t, bufsize,
> +			queue_max_hw_sectors(q) << SECTOR_SHIFT);
> +	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
> +
> +	buf = __vmalloc(bufsize, gfp_mask, PAGE_KERNEL);

__vmalloc is odd in that it takes a gfp parameter, but can't actually
use it for the page table allocations.  So you'll need to do
memalloc_noio_save here, or even better do that in the block layer and
remove the gfp_t parameter from ->report_zones.
Damien Le Moal June 28, 2019, 7:30 a.m. UTC | #2
On 6/27/19 11:09 PM, Christoph Hellwig wrote:
>> @@ -79,6 +80,7 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
>>  	put_unaligned_be32(buflen, &cmd[10]);
>>  	if (partial)
>>  		cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
>> +
>>  	memset(buf, 0, buflen);
>>  
>>  	result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
> 
> Spurious whitespace change.

Fixed.

>> +static void *sd_zbc_alloc_report_buffer(struct request_queue *q,
>> +					unsigned int nr_zones, size_t *buflen,
>> +					gfp_t gfp_mask)
>> +{
>> +	size_t bufsize;
>> +	void *buf;
>> +
>> +	/*
>> +	 * Report zone buffer size should be at most 64B times the number of
>> +	 * zones requested plus the 64B reply header, but should be at least
>> +	 * SECTOR_SIZE for ATA devices.
>> +	 * Make sure that this size does not exceed the hardware capabilities.
>> +	 * Furthermore, since the report zone command cannot be split, make
>> +	 * sure that the allocated buffer can always be mapped by limiting the
>> +	 * number of pages allocated to the HBA max segments limit.
>> +	 */
>> +	nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
>> +	bufsize = roundup((nr_zones + 1) * 64, 512);
>> +	bufsize = min_t(size_t, bufsize,
>> +			queue_max_hw_sectors(q) << SECTOR_SHIFT);
>> +	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
>> +
>> +	buf = __vmalloc(bufsize, gfp_mask, PAGE_KERNEL);
> 
> __vmalloc is odd in that it takes a gfp parameter, but can't actually
> use it for the page table allocations.  So you'll need to do
> memalloc_noio_save here, or even better do that in the block layer and
> remove the gfp_t parameter from ->report_zones.

Yes, indeed. However, removing the gfp_flags from report_zones method
would limit possibilities to only GFP_NOIO or GFP_KERNEL (default
vmalloc). What if the caller is an FS and needs GFP_NOFS, or any other
reclaim flags ? Handling all possibilities does not seem reasonable.
Handling only GFP_KERNEL and GFP_IO is easy, but that would mean that
the caller of blkdev_report_zones would need to do itself calls to
whatever  memalloc_noXX_save/restore() it needs. Is that OK ?

Currently, blkdev_report_zones() uses only either GFP_KERNEL (general
case, fs, dm and user ioctl), or GFP_NOIO for revalidate, disk scan and
dm-zoned error path. So removing the flag from the report zones method
while keeping it in the block layer API to distinguished these cases is
simple, but I am not sure if that will not pause problems for some
users. Thoughts ?
Christoph Hellwig June 28, 2019, 7:44 a.m. UTC | #3
On Fri, Jun 28, 2019 at 07:30:49AM +0000, Damien Le Moal wrote:
> 
> Yes, indeed. However, removing the gfp_flags from report_zones method
> would limit possibilities to only GFP_NOIO or GFP_KERNEL (default
> vmalloc). What if the caller is an FS and needs GFP_NOFS, or any other
> reclaim flags ? Handling all possibilities does not seem reasonable.
> Handling only GFP_KERNEL and GFP_IO is easy, but that would mean that
> the caller of blkdev_report_zones would need to do itself calls to
> whatever  memalloc_noXX_save/restore() it needs. Is that OK ?

I think it is ok.  The only real possibily is noio anyway as far as
I can tell.

> 
> Currently, blkdev_report_zones() uses only either GFP_KERNEL (general
> case, fs, dm and user ioctl), or GFP_NOIO for revalidate, disk scan and
> dm-zoned error path. So removing the flag from the report zones method
> while keeping it in the block layer API to distinguished these cases is
> simple, but I am not sure if that will not pause problems for some
> users. Thoughts ?

I'd kill it from the block layer API and require the caller to set
the per-task flag.  If I understood the mm maintainers correctly the
long term plan is to kill of GFP_NOFS and GFP_NOIO flowly and just rely
on the contexts.
Damien Le Moal June 28, 2019, 7:57 a.m. UTC | #4
On 6/28/19 4:44 PM, Christoph Hellwig wrote:
> On Fri, Jun 28, 2019 at 07:30:49AM +0000, Damien Le Moal wrote:
>>
>> Yes, indeed. However, removing the gfp_flags from report_zones method
>> would limit possibilities to only GFP_NOIO or GFP_KERNEL (default
>> vmalloc). What if the caller is an FS and needs GFP_NOFS, or any other
>> reclaim flags ? Handling all possibilities does not seem reasonable.
>> Handling only GFP_KERNEL and GFP_IO is easy, but that would mean that
>> the caller of blkdev_report_zones would need to do itself calls to
>> whatever  memalloc_noXX_save/restore() it needs. Is that OK ?
> 
> I think it is ok.  The only real possibily is noio anyway as far as
> I can tell.
> 
>>
>> Currently, blkdev_report_zones() uses only either GFP_KERNEL (general
>> case, fs, dm and user ioctl), or GFP_NOIO for revalidate, disk scan and
>> dm-zoned error path. So removing the flag from the report zones method
>> while keeping it in the block layer API to distinguished these cases is
>> simple, but I am not sure if that will not pause problems for some
>> users. Thoughts ?
> 
> I'd kill it from the block layer API and require the caller to set
> the per-task flag.  If I understood the mm maintainers correctly the
> long term plan is to kill of GFP_NOFS and GFP_NOIO flowly and just rely
> on the contexts.

OK. Got it. Easy then.
However, doing everything in this patch will make the patch quite big as
nullblk and dm also need changes. Should I kill the gfp_mask argument in
a separate patch before this one ?
Christoph Hellwig June 28, 2019, 8:03 a.m. UTC | #5
On Fri, Jun 28, 2019 at 07:57:38AM +0000, Damien Le Moal wrote:
> However, doing everything in this patch will make the patch quite big as
> nullblk and dm also need changes. Should I kill the gfp_mask argument in
> a separate patch before this one ?

Sounds good to me.
diff mbox series

Patch

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7334024b64f1..ecd967fb39c1 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -9,6 +9,7 @@ 
  */
 
 #include <linux/blkdev.h>
+#include <linux/vmalloc.h>
 
 #include <asm/unaligned.h>
 
@@ -50,7 +51,7 @@  static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
 /**
  * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
  * @sdkp: The target disk
- * @buf: Buffer to use for the reply
+ * @buf: vmalloc-ed buffer to use for the reply
  * @buflen: the buffer size
  * @lba: Start LBA of the report
  * @partial: Do partial report
@@ -79,6 +80,7 @@  static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
 	put_unaligned_be32(buflen, &cmd[10]);
 	if (partial)
 		cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
+
 	memset(buf, 0, buflen);
 
 	result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
@@ -103,6 +105,48 @@  static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
 	return 0;
 }
 
+/*
+ * Maximum number of zones to get with one report zones command.
+ */
+#define SD_ZBC_REPORT_MAX_ZONES		8192U
+
+/**
+ * Allocate a buffer for report zones reply.
+ * @disk: The target disk
+ * @nr_zones: Maximum number of zones to report
+ * @buflen: Size of the buffer allocated
+ * @gfp_mask: Memory allocation mask
+ *
+ */
+static void *sd_zbc_alloc_report_buffer(struct request_queue *q,
+					unsigned int nr_zones, size_t *buflen,
+					gfp_t gfp_mask)
+{
+	size_t bufsize;
+	void *buf;
+
+	/*
+	 * Report zone buffer size should be at most 64B times the number of
+	 * zones requested plus the 64B reply header, but should be at least
+	 * SECTOR_SIZE for ATA devices.
+	 * Make sure that this size does not exceed the hardware capabilities.
+	 * Furthermore, since the report zone command cannot be split, make
+	 * sure that the allocated buffer can always be mapped by limiting the
+	 * number of pages allocated to the HBA max segments limit.
+	 */
+	nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
+	bufsize = roundup((nr_zones + 1) * 64, 512);
+	bufsize = min_t(size_t, bufsize,
+			queue_max_hw_sectors(q) << SECTOR_SHIFT);
+	bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+	buf = __vmalloc(bufsize, gfp_mask, PAGE_KERNEL);
+	if (buf)
+		*buflen = bufsize;
+
+	return buf;
+}
+
 /**
  * sd_zbc_report_zones - Disk report zones operation.
  * @disk: The target disk
@@ -118,9 +162,9 @@  int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 			gfp_t gfp_mask)
 {
 	struct scsi_disk *sdkp = scsi_disk(disk);
-	unsigned int i, buflen, nrz = *nr_zones;
+	unsigned int i, nrz = *nr_zones;
 	unsigned char *buf;
-	size_t offset = 0;
+	size_t buflen = 0, offset = 0;
 	int ret = 0;
 
 	if (!sd_is_zoned(sdkp))
@@ -132,16 +176,14 @@  int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 	 * without exceeding the device maximum command size. For ATA disks,
 	 * buffers must be aligned to 512B.
 	 */
-	buflen = min(queue_max_hw_sectors(disk->queue) << 9,
-		     roundup((nrz + 1) * 64, 512));
-	buf = kmalloc(buflen, gfp_mask);
+	buf = sd_zbc_alloc_report_buffer(disk->queue, nrz, &buflen, gfp_mask);
 	if (!buf)
 		return -ENOMEM;
 
 	ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
 			sectors_to_logical(sdkp->device, sector), true);
 	if (ret)
-		goto out_free_buf;
+		goto out;
 
 	nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
 	for (i = 0; i < nrz; i++) {
@@ -152,8 +194,8 @@  int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 
 	*nr_zones = nrz;
 
-out_free_buf:
-	kfree(buf);
+out:
+	kvfree(buf);
 
 	return ret;
 }
@@ -287,8 +329,6 @@  static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
 	return 0;
 }
 
-#define SD_ZBC_BUF_SIZE 131072U
-
 /**
  * sd_zbc_check_zones - Check the device capacity and zone sizes
  * @sdkp: Target disk
@@ -304,22 +344,23 @@  static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
  */
 static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 {
+	size_t bufsize, buflen;
 	u64 zone_blocks = 0;
 	sector_t max_lba, block = 0;
 	unsigned char *buf;
 	unsigned char *rec;
-	unsigned int buf_len;
-	unsigned int list_length;
 	int ret;
 	u8 same;
 
 	/* Get a buffer */
-	buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+	buf = sd_zbc_alloc_report_buffer(sdkp->disk->queue,
+					 SD_ZBC_REPORT_MAX_ZONES,
+					 &bufsize, GFP_NOIO);
 	if (!buf)
 		return -ENOMEM;
 
 	/* Do a report zone to get max_lba and the same field */
-	ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
+	ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
 	if (ret)
 		goto out_free;
 
@@ -355,12 +396,12 @@  static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 	do {
 
 		/* Parse REPORT ZONES header */
-		list_length = get_unaligned_be32(&buf[0]) + 64;
+		buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
+			       bufsize);
 		rec = buf + 64;
-		buf_len = min(list_length, SD_ZBC_BUF_SIZE);
 
 		/* Parse zone descriptors */
-		while (rec < buf + buf_len) {
+		while (rec < buf + buflen) {
 			u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
 
 			if (zone_blocks == 0) {
@@ -376,8 +417,8 @@  static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 		}
 
 		if (block < sdkp->capacity) {
-			ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
-						     block, true);
+			ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
+						     true);
 			if (ret)
 				goto out_free;
 		}
@@ -408,7 +449,7 @@  static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 	}
 
 out_free:
-	kfree(buf);
+	kvfree(buf);
 
 	return ret;
 }