[V3,13/13] HV/Storvsc: Add Isolation VM support for storvsc driver

Message ID 20210809175620.720923-14-ltykernel@gmail.com (mailing list archive)
State Not Applicable
Series x86/Hyper-V: Add Hyper-V Isolation VM support

Commit Message

Tianyu Lan Aug. 9, 2021, 5:56 p.m. UTC
From: Tianyu Lan <Tianyu.Lan@microsoft.com>

In Isolation VM, all shared memory with host needs to mark visible
to host via hvcall. vmbus_establish_gpadl() has already done it for
storvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
mpb_desc() still need to handle. Use DMA API to map/umap these
memory during sending/receiving packet and Hyper-V DMA ops callback
will use swiotlb function to allocate bounce buffer and copy data
from/to bounce buffer.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
 drivers/scsi/storvsc_drv.c | 68 +++++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 5 deletions(-)

Comments

Michael Kelley (LINUX) Aug. 19, 2021, 6:17 p.m. UTC | #1
From: Tianyu Lan <ltykernel@gmail.com> Sent: Monday, August 9, 2021 10:56 AM
> 

Subject line tag should be "scsi: storvsc:"

> In Isolation VM, all shared memory with host needs to mark visible
> to host via hvcall. vmbus_establish_gpadl() has already done it for
> storvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
> mpb_desc() still need to handle. Use DMA API to map/umap these

s/need to handle/needs to be handled/

> memory during sending/receiving packet and Hyper-V DMA ops callback
> will use swiotlb function to allocate bounce buffer and copy data
> from/to bounce buffer.
> 
> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> ---
>  drivers/scsi/storvsc_drv.c | 68 +++++++++++++++++++++++++++++++++++---
>  1 file changed, 63 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index 328bb961c281..78320719bdd8 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -21,6 +21,8 @@
>  #include <linux/device.h>
>  #include <linux/hyperv.h>
>  #include <linux/blkdev.h>
> +#include <linux/io.h>
> +#include <linux/dma-mapping.h>
>  #include <scsi/scsi.h>
>  #include <scsi/scsi_cmnd.h>
>  #include <scsi/scsi_host.h>
> @@ -427,6 +429,8 @@ struct storvsc_cmd_request {
>  	u32 payload_sz;
> 
>  	struct vstor_packet vstor_packet;
> +	u32 hvpg_count;

This count is really the number of entries in the dma_range
array, right?  If so, perhaps "dma_range_count" would be
a better name so that it is more tightly associated.

> +	struct hv_dma_range *dma_range;
>  };
> 
> 
> @@ -509,6 +513,14 @@ struct storvsc_scan_work {
>  	u8 tgt_id;
>  };
> 
> +#define storvsc_dma_map(dev, page, offset, size, dir) \
> +	dma_map_page(dev, page, offset, size, dir)
> +
> +#define storvsc_dma_unmap(dev, dma_range, dir)		\
> +		dma_unmap_page(dev, dma_range.dma,	\
> +			       dma_range.mapping_size,	\
> +			       dir ? DMA_FROM_DEVICE : DMA_TO_DEVICE)
> +

Each of these macros is used only once.  IMHO, they don't
add a lot of value.  Just coding dma_map/unmap_page()
inline would be fine and eliminate these lines of code.

>  static void storvsc_device_scan(struct work_struct *work)
>  {
>  	struct storvsc_scan_work *wrk;
> @@ -1260,6 +1272,7 @@ static void storvsc_on_channel_callback(void *context)
>  	struct hv_device *device;
>  	struct storvsc_device *stor_device;
>  	struct Scsi_Host *shost;
> +	int i;
> 
>  	if (channel->primary_channel != NULL)
>  		device = channel->primary_channel->device_obj;
> @@ -1314,6 +1327,15 @@ static void storvsc_on_channel_callback(void *context)
>  				request = (struct storvsc_cmd_request *)scsi_cmd_priv(scmnd);
>  			}
> 
> +			if (request->dma_range) {
> +				for (i = 0; i < request->hvpg_count; i++)
> +					storvsc_dma_unmap(&device->device,
> +						request->dma_range[i],
> +						request->vstor_packet.vm_srb.data_in == READ_TYPE);

I think you can directly get the DMA direction as request->cmd->sc_data_direction.
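
(For illustration, combining this suggestion with the earlier point about
dropping the macros, the unmap call written out inline might look roughly like
the sketch below; this is not code from the patch, and it assumes the existing
"cmd" back-pointer in struct storvsc_cmd_request:)

				dma_unmap_page(&device->device,
					       request->dma_range[i].dma,
					       request->dma_range[i].mapping_size,
					       request->cmd->sc_data_direction);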

> +
> +				kfree(request->dma_range);
> +			}
> +
>  			storvsc_on_receive(stor_device, packet, request);
>  			continue;
>  		}
> @@ -1810,7 +1832,9 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>  		unsigned int hvpgoff, hvpfns_to_add;
>  		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
>  		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
> +		dma_addr_t dma;
>  		u64 hvpfn;
> +		u32 size;
> 
>  		if (hvpg_count > MAX_PAGE_BUFFER_COUNT) {
> 
> @@ -1824,6 +1848,13 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>  		payload->range.len = length;
>  		payload->range.offset = offset_in_hvpg;
> 
> +		cmd_request->dma_range = kcalloc(hvpg_count,
> +				 sizeof(*cmd_request->dma_range),
> +				 GFP_ATOMIC);

With this patch, it appears that storvsc_queuecommand() is always
doing bounce buffering, even when running in a non-isolated VM.
The dma_range is always allocated, and the inner loop below does
the dma mapping for every I/O page.  The corresponding code in
storvsc_on_channel_callback() that does the dma unmap allows for
the dma_range to be NULL, but that never happens.

> +		if (!cmd_request->dma_range) {
> +			ret = -ENOMEM;

The other memory allocation failure in this function returns
SCSI_MLQUEUE_DEVICE_BUSY.   It may be debatable as to whether
that's the best approach, but that's a topic for a different patch.  I
would suggest being consistent and using the same return code
here.

> +			goto free_payload;
> +		}
> 
>  		for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
>  			/*
> @@ -1847,9 +1878,29 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>  			 * last sgl should be reached at the same time that
>  			 * the PFN array is filled.
>  			 */
> -			while (hvpfns_to_add--)
> -				payload->range.pfn_array[i++] =	hvpfn++;
> +			while (hvpfns_to_add--) {
> +				size = min(HV_HYP_PAGE_SIZE - offset_in_hvpg,
> +					   (unsigned long)length);
> +				dma = storvsc_dma_map(&dev->device, pfn_to_page(hvpfn++),
> +						      offset_in_hvpg, size,
> +						      scmnd->sc_data_direction);
> +				if (dma_mapping_error(&dev->device, dma)) {
> +					ret = -ENOMEM;

The typical error from dma_map_page() will be running out of
bounce buffer memory.   This is a transient condition that should be
retried at the higher levels.  So make sure to return an error code
that indicates the I/O should be resubmitted.
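
(A sketch of what that might look like, using SCSI_MLQUEUE_DEVICE_BUSY as the
retry code, which is what is settled on later in the thread:)

				if (dma_mapping_error(&dev->device, dma)) {
					/*
					 * Likely transient swiotlb exhaustion;
					 * ask the midlayer to requeue the I/O.
					 */
					ret = SCSI_MLQUEUE_DEVICE_BUSY;
					goto free_dma_range;
				}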

> +					goto free_dma_range;
> +				}
> +
> +				if (offset_in_hvpg) {
> +					payload->range.offset = dma & ~HV_HYP_PAGE_MASK;
> +					offset_in_hvpg = 0;
> +				}

I'm not clear on why payload->range.offset needs to be set again.
Even after the dma mapping is done, doesn't the offset in the first
page have to be the same?  If it wasn't the same, Hyper-V wouldn't
be able to process the PFN list correctly.  In fact, couldn't the above
code just always set offset_in_hvpg = 0?

> +
> +				cmd_request->dma_range[i].dma = dma;
> +				cmd_request->dma_range[i].mapping_size = size;
> +				payload->range.pfn_array[i++] = dma >> HV_HYP_PAGE_SHIFT;
> +				length -= size;
> +			}
>  		}
> +		cmd_request->hvpg_count = hvpg_count;

This line just saves the size of the dma_range array.  Could
it be moved up with the code that allocates the dma_range
array?  To me, it would make more sense to have all that
code together in one place.

>  	}

The whole approach here is to do dma remapping on each individual page
of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
each scatterlist entry as a unit?  Each scatterlist entry describes a range of
physically contiguous memory.  After dma_map_sg(), the resulting dma
address must also refer to a physically contiguous range in the swiotlb
bounce buffer memory.   So at the top of the "for" loop over the scatterlist
entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
hvpfn value based on the dma address instead of sg_page().  But everything
else is the same, and the inner loop for populating the pfn_array is unmodified.
Furthermore, the dma_range array that you've added is not needed, since
scatterlist entries already have a dma_address field for saving the mapped
address, and dma_unmap_sg() uses that field.

One thing:  There's a maximum swiotlb mapping size, which I think works
out to be 256 Kbytes.  See swiotlb_max_mapping_size().  We need to make
sure that we don't get a scatterlist entry bigger than this size.  But I think
this already happens because you set the device->dma_mask field in
Patch 11 of this series.  __scsi_init_queue checks for this setting and
sets max_sectors to limit transfers to the max mapping size.
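
(For reference, the relevant check in __scsi_init_queue() is roughly the
following; paraphrased from drivers/scsi/scsi_lib.c, not taken from this series:)

	if (dev->dma_mask) {
		shost->max_sectors = min_t(unsigned int, shost->max_sectors,
				dma_max_mapping_size(dev) >> SECTOR_SHIFT);
	}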

> 
>  	cmd_request->payload = payload;
> @@ -1860,13 +1911,20 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>  	put_cpu();
> 
>  	if (ret == -EAGAIN) {
> -		if (payload_sz > sizeof(cmd_request->mpb))
> -			kfree(payload);
>  		/* no more space */
> -		return SCSI_MLQUEUE_DEVICE_BUSY;
> +		ret = SCSI_MLQUEUE_DEVICE_BUSY;
> +		goto free_dma_range;
>  	}
> 
>  	return 0;
> +
> +free_dma_range:
> +	kfree(cmd_request->dma_range);
> +
> +free_payload:
> +	if (payload_sz > sizeof(cmd_request->mpb))
> +		kfree(payload);
> +	return ret;
>  }
> 
>  static struct scsi_host_template scsi_driver = {
> --
> 2.25.1
Christoph Hellwig Aug. 20, 2021, 4:32 a.m. UTC | #2
On Thu, Aug 19, 2021 at 06:17:40PM +0000, Michael Kelley wrote:
> > +#define storvsc_dma_map(dev, page, offset, size, dir) \
> > +	dma_map_page(dev, page, offset, size, dir)
> > +
> > +#define storvsc_dma_unmap(dev, dma_range, dir)		\
> > +		dma_unmap_page(dev, dma_range.dma,	\
> > +			       dma_range.mapping_size,	\
> > +			       dir ? DMA_FROM_DEVICE : DMA_TO_DEVICE)
> > +
> 
> Each of these macros is used only once.  IMHO, they don't
> add a lot of value.  Just coding dma_map/unmap_page()
> inline would be fine and eliminate these lines of code.

Yes, I had the same thought when looking over the code.  Especially
as macros tend to further obfuscate the code (compared to actual helper
functions).

> > +				for (i = 0; i < request->hvpg_count; i++)
> > +					storvsc_dma_unmap(&device->device,
> > +						request->dma_range[i],
> > +						request->vstor_packet.vm_srb.data_in == READ_TYPE);
> 
> I think you can directly get the DMA direction as request->cmd->sc_data_direction.

Yes.

> > 
> > @@ -1824,6 +1848,13 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
> >  		payload->range.len = length;
> >  		payload->range.offset = offset_in_hvpg;
> > 
> > +		cmd_request->dma_range = kcalloc(hvpg_count,
> > +				 sizeof(*cmd_request->dma_range),
> > +				 GFP_ATOMIC);
> 
> With this patch, it appears that storvsc_queuecommand() is always
> doing bounce buffering, even when running in a non-isolated VM.
> The dma_range is always allocated, and the inner loop below does
> the dma mapping for every I/O page.  The corresponding code in
> storvsc_on_channel_callback() that does the dma unmap allows for
> the dma_range to be NULL, but that never happens.

Maybe I'm missing something in the hyperv code, but I don't think
dma_map_page would bounce buffer for the non-isolated case.  It
will just return the physical address.

> > +		if (!cmd_request->dma_range) {
> > +			ret = -ENOMEM;
> 
> The other memory allocation failure in this function returns
> SCSI_MLQUEUE_DEVICE_BUSY.   It may be debatable as to whether
> that's the best approach, but that's a topic for a different patch.  I
> would suggest being consistent and using the same return code
> here.

Independent of whether SCSI_MLQUEUE_DEVICE_BUSY is good (it is a common
pattern in SCSI drivers), ->queuecommand can't return normal
negative errnos.  It must return the SCSI_MLQUEUE_* codes or 0.
We should probably change the return type of the method definition
to a suitable enum to make this more clear.
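
(A purely hypothetical sketch of the kind of enum being suggested; nothing
like this exists in the kernel as of this thread:)

enum scsi_queuecommand_status {
	SCSI_QC_ACCEPTED	= 0,
	SCSI_QC_HOST_BUSY	= SCSI_MLQUEUE_HOST_BUSY,
	SCSI_QC_DEVICE_BUSY	= SCSI_MLQUEUE_DEVICE_BUSY,
	SCSI_QC_EH_RETRY	= SCSI_MLQUEUE_EH_RETRY,
	SCSI_QC_TARGET_BUSY	= SCSI_MLQUEUE_TARGET_BUSY,
};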

> > +				if (offset_in_hvpg) {
> > +					payload->range.offset = dma & ~HV_HYP_PAGE_MASK;
> > +					offset_in_hvpg = 0;
> > +				}
> 
> I'm not clear on why payload->range.offset needs to be set again.
> Even after the dma mapping is done, doesn't the offset in the first
> page have to be the same?  If it wasn't the same, Hyper-V wouldn't
> be able to process the PFN list correctly.  In fact, couldn't the above
> code just always set offset_in_hvpg = 0?

Careful.  DMA mapping is supposed to keep the offset in the page, but
for that the DMA mapping code needs to know what the device considers a
"page".  For that the driver needs to set the min_align_mask field in
struct device_dma_parameters.
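
(As a sketch of what that would mean for storvsc, something along these lines
in the probe path, assuming dev->dma_parms is already set up and that a
Hyper-V page is the right "page" granularity; the placement is an assumption,
not taken from this series:)

	/* Ask the DMA layer/swiotlb to preserve intra-page offsets at
	 * Hyper-V page granularity when bounce buffering.
	 */
	dma_set_min_align_mask(&device->device, HV_HYP_PAGE_SIZE - 1);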

> 
> The whole approach here is to do dma remapping on each individual page
> of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
> each scatterlist entry as a unit?  Each scatterlist entry describes a range of
> physically contiguous memory.  After dma_map_sg(), the resulting dma
> address must also refer to a physically contiguous range in the swiotlb
> bounce buffer memory.   So at the top of the "for" loop over the scatterlist
> entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
> hvpfn value based on the dma address instead of sg_page().  But everything
> else is the same, and the inner loop for populating the pfn_arry is unmodified.
> Furthermore, the dma_range array that you've added is not needed, since
> scatterlist entries already have a dma_address field for saving the mapped
> address, and dma_unmap_sg() uses that field.

Yes, I think dma_map_sg is the right thing to use here, probably even
for the non-isolated case so that we can get the hv drivers out of their
little corner and into being more like a normal kernel driver.  That
is, use the scsi_dma_map/scsi_dma_unmap helpers, and then iterate over
the dma addresses one page at a time using for_each_sg_dma_page.
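
(A rough sketch of that flow, with offset handling and cleanup omitted, and
assuming PAGE_SIZE == HV_HYP_PAGE_SIZE as on x86; not code from this series:)

	struct scatterlist *sgl = scsi_sglist(scmnd);
	struct sg_dma_page_iter iter;
	int nents, i = 0;

	nents = scsi_dma_map(scmnd);
	if (nents < 0)
		return SCSI_MLQUEUE_DEVICE_BUSY;

	/* Walk the mapped (possibly bounced) dma addresses page by page. */
	for_each_sg_dma_page(sgl, &iter, nents, 0)
		payload->range.pfn_array[i++] =
			sg_page_iter_dma_address(&iter) >> HV_HYP_PAGE_SHIFT;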

> 
> One thing:  There's a maximum swiotlb mapping size, which I think works
> out to be 256 Kbytes.  See swiotlb_max_mapping_size().  We need to make
> sure that we don't get a scatterlist entry bigger than this size.  But I think
> this already happens because you set the device->dma_mask field in
> Patch 11 of this series.  __scsi_init_queue checks for this setting and
> sets max_sectors to limits transfers to the max mapping size.

Indeed.
Tianyu Lan Aug. 20, 2021, 3:20 p.m. UTC | #3
On 8/20/2021 2:17 AM, Michael Kelley wrote:
> From: Tianyu Lan <ltykernel@gmail.com> Sent: Monday, August 9, 2021 10:56 AM
>>
> 
> Subject line tag should be "scsi: storvsc:"
> 
>> In Isolation VM, all shared memory with host needs to mark visible
>> to host via hvcall. vmbus_establish_gpadl() has already done it for
>> storvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
>> mpb_desc() still need to handle. Use DMA API to map/umap these
> 
> s/need to handle/needs to be handled/
> 
>> memory during sending/receiving packet and Hyper-V DMA ops callback
>> will use swiotlb function to allocate bounce buffer and copy data
>> from/to bounce buffer.
>>
>> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
>> ---
>>   drivers/scsi/storvsc_drv.c | 68 +++++++++++++++++++++++++++++++++++---
>>   1 file changed, 63 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
>> index 328bb961c281..78320719bdd8 100644
>> --- a/drivers/scsi/storvsc_drv.c
>> +++ b/drivers/scsi/storvsc_drv.c
>> @@ -21,6 +21,8 @@
>>   #include <linux/device.h>
>>   #include <linux/hyperv.h>
>>   #include <linux/blkdev.h>
>> +#include <linux/io.h>
>> +#include <linux/dma-mapping.h>
>>   #include <scsi/scsi.h>
>>   #include <scsi/scsi_cmnd.h>
>>   #include <scsi/scsi_host.h>
>> @@ -427,6 +429,8 @@ struct storvsc_cmd_request {
>>   	u32 payload_sz;
>>
>>   	struct vstor_packet vstor_packet;
>> +	u32 hvpg_count;
> 
> This count is really the number of entries in the dma_range
> array, right?  If so, perhaps "dma_range_count" would be
> a better name so that it is more tightly associated.

Yes, will update.

> 
>> +	struct hv_dma_range *dma_range;
>>   };
>>
>>
>> @@ -509,6 +513,14 @@ struct storvsc_scan_work {
>>   	u8 tgt_id;
>>   };
>>
>> +#define storvsc_dma_map(dev, page, offset, size, dir) \
>> +	dma_map_page(dev, page, offset, size, dir)
>> +
>> +#define storvsc_dma_unmap(dev, dma_range, dir)		\
>> +		dma_unmap_page(dev, dma_range.dma,	\
>> +			       dma_range.mapping_size,	\
>> +			       dir ? DMA_FROM_DEVICE : DMA_TO_DEVICE)
>> +
> 
> Each of these macros is used only once.  IMHO, they don't
> add a lot of value.  Just coding dma_map/unmap_page()
> inline would be fine and eliminate these lines of code.

OK. Will update.

> 
>>   static void storvsc_device_scan(struct work_struct *work)
>>   {
>>   	struct storvsc_scan_work *wrk;
>> @@ -1260,6 +1272,7 @@ static void storvsc_on_channel_callback(void *context)
>>   	struct hv_device *device;
>>   	struct storvsc_device *stor_device;
>>   	struct Scsi_Host *shost;
>> +	int i;
>>
>>   	if (channel->primary_channel != NULL)
>>   		device = channel->primary_channel->device_obj;
>> @@ -1314,6 +1327,15 @@ static void storvsc_on_channel_callback(void *context)
>>   				request = (struct storvsc_cmd_request *)scsi_cmd_priv(scmnd);
>>   			}
>>
>> +			if (request->dma_range) {
>> +				for (i = 0; i < request->hvpg_count; i++)
>> +					storvsc_dma_unmap(&device->device,
>> +						request->dma_range[i],
>> +						request->vstor_packet.vm_srb.data_in == READ_TYPE);
> 
> I think you can directly get the DMA direction as request->cmd->sc_data_direction.
> 
>> +
>> +				kfree(request->dma_range);
>> +			}
>> +
>>   			storvsc_on_receive(stor_device, packet, request);
>>   			continue;
>>   		}
>> @@ -1810,7 +1832,9 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>>   		unsigned int hvpgoff, hvpfns_to_add;
>>   		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
>>   		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
>> +		dma_addr_t dma;
>>   		u64 hvpfn;
>> +		u32 size;
>>
>>   		if (hvpg_count > MAX_PAGE_BUFFER_COUNT) {
>>
>> @@ -1824,6 +1848,13 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>>   		payload->range.len = length;
>>   		payload->range.offset = offset_in_hvpg;
>>
>> +		cmd_request->dma_range = kcalloc(hvpg_count,
>> +				 sizeof(*cmd_request->dma_range),
>> +				 GFP_ATOMIC);
> 
> With this patch, it appears that storvsc_queuecommand() is always
> doing bounce buffering, even when running in a non-isolated VM.

In a non-isolated VM, SWIOTLB_FORCE mode isn't enabled, so the swiotlb
bounce buffer will not be used.

> The dma_range is always allocated, and the inner loop below does
> the dma mapping for every I/O page.  The corresponding code in
> storvsc_on_channel_callback() that does the dma unmap allows for
> the dma_range to be NULL, but that never happens.

Yes, the dma mapping function will return the PA directly in a non-isolated VM.

> 
>> +		if (!cmd_request->dma_range) {
>> +			ret = -ENOMEM;
> 
> The other memory allocation failure in this function returns
> SCSI_MLQUEUE_DEVICE_BUSY.   It may be debatable as to whether
> that's the best approach, but that's a topic for a different patch.  I
> would suggest being consistent and using the same return code
> here.

OK. I will keep returning SCSI_MLQUEUE_DEVICE_BUSY here.

> 
>> +			goto free_payload;
>> +		}
>>
>>   		for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
>>   			/*
>> @@ -1847,9 +1878,29 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>>   			 * last sgl should be reached at the same time that
>>   			 * the PFN array is filled.
>>   			 */
>> -			while (hvpfns_to_add--)
>> -				payload->range.pfn_array[i++] =	hvpfn++;
>> +			while (hvpfns_to_add--) {
>> +				size = min(HV_HYP_PAGE_SIZE - offset_in_hvpg,
>> +					   (unsigned long)length);
>> +				dma = storvsc_dma_map(&dev->device, pfn_to_page(hvpfn++),
>> +						      offset_in_hvpg, size,
>> +						      scmnd->sc_data_direction);
>> +				if (dma_mapping_error(&dev->device, dma)) {
>> +					ret = -ENOMEM;
> 
> The typical error from dma_map_page() will be running out of
> bounce buffer memory.   This is a transient condition that should be
> retried at the higher levels.  So make sure to return an error code
> that indicates the I/O should be resubmitted.

OK. It looks like the error code should be SCSI_MLQUEUE_DEVICE_BUSY here.

> 
>> +					goto free_dma_range;
>> +				}
>> +
>> +				if (offset_in_hvpg) {
>> +					payload->range.offset = dma & ~HV_HYP_PAGE_MASK;
>> +					offset_in_hvpg = 0;
>> +				}
> 
> I'm not clear on why payload->range.offset needs to be set again.
> Even after the dma mapping is done, doesn't the offset in the first
> page have to be the same?  If it wasn't the same, Hyper-V wouldn't
> be able to process the PFN list correctly.  In fact, couldn't the above
> code just always set offset_in_hvpg = 0?

The offset will change. The swiotlb bounce buffer is allocated in units of
IO_TLB_SIZE (2K), so the offset here may change.

> 
>> +
>> +				cmd_request->dma_range[i].dma = dma;
>> +				cmd_request->dma_range[i].mapping_size = size;
>> +				payload->range.pfn_array[i++] = dma >> HV_HYP_PAGE_SHIFT;
>> +				length -= size;
>> +			}
>>   		}
>> +		cmd_request->hvpg_count = hvpg_count;
> 
> This line just saves the size of the dma_range array.  Could
> it be moved up with the code that allocates the dma_range
> array?  To me, it would make more sense to have all that
> code together in one place.

Sure. Will update.

> 
>>   	}
> 
> The whole approach here is to do dma remapping on each individual page
> of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
> each scatterlist entry as a unit?  Each scatterlist entry describes a range of
> physically contiguous memory.  After dma_map_sg(), the resulting dma
> address must also refer to a physically contiguous range in the swiotlb
> bounce buffer memory.   So at the top of the "for" loop over the scatterlist
> entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
> hvpfn value based on the dma address instead of sg_page().  But everything
> else is the same, and the inner loop for populating the pfn_arry is unmodified.
> Furthermore, the dma_range array that you've added is not needed, since
> scatterlist entries already have a dma_address field for saving the mapped
> address, and dma_unmap_sg() uses that field.

I don't use dma_map_sg() here in order to avoid introducing one more
loop (i.e., the one inside dma_map_sg()). We already have a loop to populate
cmd_request->dma_range[], so the dma map is done in the same loop.

> 
> One thing:  There's a maximum swiotlb mapping size, which I think works
> out to be 256 Kbytes.  See swiotlb_max_mapping_size().  We need to make
> sure that we don't get a scatterlist entry bigger than this size.  But I think
> this already happens because you set the device->dma_mask field in
> Patch 11 of this series.  __scsi_init_queue checks for this setting and
> sets max_sectors to limits transfers to the max mapping size.

I will double check.

> 
>>
>>   	cmd_request->payload = payload;
>> @@ -1860,13 +1911,20 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>>   	put_cpu();
>>
>>   	if (ret == -EAGAIN) {
>> -		if (payload_sz > sizeof(cmd_request->mpb))
>> -			kfree(payload);
>>   		/* no more space */
>> -		return SCSI_MLQUEUE_DEVICE_BUSY;
>> +		ret = SCSI_MLQUEUE_DEVICE_BUSY;
>> +		goto free_dma_range;
>>   	}
>>
>>   	return 0;
>> +
>> +free_dma_range:
>> +	kfree(cmd_request->dma_range);
>> +
>> +free_payload:
>> +	if (payload_sz > sizeof(cmd_request->mpb))
>> +		kfree(payload);
>> +	return ret;
>>   }
>>
>>   static struct scsi_host_template scsi_driver = {
>> --
>> 2.25.1
>
Tianyu Lan Aug. 20, 2021, 3:37 p.m. UTC | #4
On 8/20/2021 11:20 PM, Tianyu Lan wrote:
>> The whole approach here is to do dma remapping on each individual page
>> of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to 
>> map
>> each scatterlist entry as a unit?  Each scatterlist entry describes a 
>> range of
>> physically contiguous memory.  After dma_map_sg(), the resulting dma
>> address must also refer to a physically contiguous range in the swiotlb
>> bounce buffer memory.   So at the top of the "for" loop over the 
>> scatterlist
>> entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
>> hvpfn value based on the dma address instead of sg_page().  But 
>> everything
>> else is the same, and the inner loop for populating the pfn_arry is 
>> unmodified.
>> Furthermore, the dma_range array that you've added is not needed, since
>> scatterlist entries already have a dma_address field for saving the 
>> mapped
>> address, and dma_unmap_sg() uses that field.
> 
> I don't use dma_map_sg() here in order to avoid introducing one more 
> loop(e,g dma_map_sg()). We already have a loop to populate 
> cmd_request->dma_range[] and so do the dma map in the same loop.

Sorry for a typo. s/cmd_request->dma_range[]/payload->range.pfn_array[]/
Michael Kelley (LINUX) Aug. 20, 2021, 3:40 p.m. UTC | #5
From: hch@lst.de <hch@lst.de> Sent: Thursday, August 19, 2021 9:33 PM
> 
> On Thu, Aug 19, 2021 at 06:17:40PM +0000, Michael Kelley wrote:
> > >
> > > @@ -1824,6 +1848,13 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
> > >  		payload->range.len = length;
> > >  		payload->range.offset = offset_in_hvpg;
> > >
> > > +		cmd_request->dma_range = kcalloc(hvpg_count,
> > > +				 sizeof(*cmd_request->dma_range),
> > > +				 GFP_ATOMIC);
> >
> > With this patch, it appears that storvsc_queuecommand() is always
> > doing bounce buffering, even when running in a non-isolated VM.
> > The dma_range is always allocated, and the inner loop below does
> > the dma mapping for every I/O page.  The corresponding code in
> > storvsc_on_channel_callback() that does the dma unmap allows for
> > the dma_range to be NULL, but that never happens.
> 
> Maybe I'm missing something in the hyperv code, but I don't think
> dma_map_page would bounce buffer for the non-isolated case.  It
> will just return the physical address.

OK, right.  In the isolated VM case, the swiotlb is in force mode
and will do bounce buffering.  In the non-isolated case,
dma_map_page_attrs() -> dma_direct_map_page() does a lot of
checking but eventually just returns the physical address.  As this
patch is currently coded, it adds a fair amount of overhead
here in storvsc_queuecommand(), plus the overhead of the dma
mapping function deciding to use the identity mapping.  But if
dma_map_sg() is used and the code is simplified a bit, the overhead
will be less in general and will be per sgl entry instead of per page.

> 
> > > +				if (offset_in_hvpg) {
> > > +					payload->range.offset = dma & ~HV_HYP_PAGE_MASK;
> > > +					offset_in_hvpg = 0;
> > > +				}
> >
> > I'm not clear on why payload->range.offset needs to be set again.
> > Even after the dma mapping is done, doesn't the offset in the first
> > page have to be the same?  If it wasn't the same, Hyper-V wouldn't
> > be able to process the PFN list correctly.  In fact, couldn't the above
> > code just always set offset_in_hvpg = 0?
> 
> Careful.  DMA mapping is supposed to keep the offset in the page, but
> for that the DMA mapping code needs to know what the device considers a
> "page".  For that the driver needs to set the min_align_mask field in
> struct device_dma_parameters.
> 

I see that the swiotlb code gets and uses the min_align_mask field.  But
the NVME driver is the only driver that ever sets it, so the value is zero
in all other cases.  Does swiotlb just use PAGE_SIZE in that case?  I
couldn't tell from a quick glance at the swiotlb code.

> >
> > The whole approach here is to do dma remapping on each individual page
> > of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
> > each scatterlist entry as a unit?  Each scatterlist entry describes a range of
> > physically contiguous memory.  After dma_map_sg(), the resulting dma
> > address must also refer to a physically contiguous range in the swiotlb
> > bounce buffer memory.   So at the top of the "for" loop over the scatterlist
> > entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
> > hvpfn value based on the dma address instead of sg_page().  But everything
> > else is the same, and the inner loop for populating the pfn_arry is unmodified.
> > Furthermore, the dma_range array that you've added is not needed, since
> > scatterlist entries already have a dma_address field for saving the mapped
> > address, and dma_unmap_sg() uses that field.
> 
> Yes, I think dma_map_sg is the right thing to use here, probably even
> for the non-isolated case so that we can get the hv drivers out of their
> little corner and into being more like a normal kernel driver.  That
> is, use the scsi_dma_map/scsi_dma_unmap helpers, and then iterate over
> the dma addresses one page at a time using for_each_sg_dma_page.
> 

Doing some broader revisions to the Hyper-V storvsc driver is up next on
my to-do list.  Rather than significantly modifying the non-isolated case in
this patch set, I'd suggest factoring it into my broader revisions.

Michael
Tianyu Lan Aug. 20, 2021, 4:01 p.m. UTC | #6
On 8/20/2021 12:32 PM, hch@lst.de wrote:
> On Thu, Aug 19, 2021 at 06:17:40PM +0000, Michael Kelley wrote:
>>> +#define storvsc_dma_map(dev, page, offset, size, dir) \
>>> +	dma_map_page(dev, page, offset, size, dir)
>>> +
>>> +#define storvsc_dma_unmap(dev, dma_range, dir)		\
>>> +		dma_unmap_page(dev, dma_range.dma,	\
>>> +			       dma_range.mapping_size,	\
>>> +			       dir ? DMA_FROM_DEVICE : DMA_TO_DEVICE)
>>> +
>>
>> Each of these macros is used only once.  IMHO, they don't
>> add a lot of value.  Just coding dma_map/unmap_page()
>> inline would be fine and eliminate these lines of code.
> 
> Yes, I had the same thought when looking over the code.  Especially
> as macros tend to further obsfucate the code (compared to actual helper
> functions).
> 
>>> +				for (i = 0; i < request->hvpg_count; i++)
>>> +					storvsc_dma_unmap(&device->device,
>>> +						request->dma_range[i],
>>> +						request->vstor_packet.vm_srb.data_in == READ_TYPE);
>>
>> I think you can directly get the DMA direction as request->cmd->sc_data_direction.
> 
> Yes.
> 
>>>
>>> @@ -1824,6 +1848,13 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>>>   		payload->range.len = length;
>>>   		payload->range.offset = offset_in_hvpg;
>>>
>>> +		cmd_request->dma_range = kcalloc(hvpg_count,
>>> +				 sizeof(*cmd_request->dma_range),
>>> +				 GFP_ATOMIC);
>>
>> With this patch, it appears that storvsc_queuecommand() is always
>> doing bounce buffering, even when running in a non-isolated VM.
>> The dma_range is always allocated, and the inner loop below does
>> the dma mapping for every I/O page.  The corresponding code in
>> storvsc_on_channel_callback() that does the dma unmap allows for
>> the dma_range to be NULL, but that never happens.
> 
> Maybe I'm missing something in the hyperv code, but I don't think
> dma_map_page would bounce buffer for the non-isolated case.  It
> will just return the physical address.

Yes, the swiotlb_force mode isn't enabled in a non-isolated VM, so
dma_map_page() returns the physical address directly.

> 
>>> +		if (!cmd_request->dma_range) {
>>> +			ret = -ENOMEM;
>>
>> The other memory allocation failure in this function returns
>> SCSI_MLQUEUE_DEVICE_BUSY.   It may be debatable as to whether
>> that's the best approach, but that's a topic for a different patch.  I
>> would suggest being consistent and using the same return code
>> here.
> 
> Independent of if SCSI_MLQUEUE_DEVICE_BUSY is good (it it a common
> pattern in SCSI drivers), ->queuecommand can't return normal
> negative errnos.  It must return the SCSI_MLQUEUE_* codes or 0.
> We should probably change the return type of the method definition
> to a suitable enum to make this more clear.

Yes, will update. Thanks.

> 
>>> +				if (offset_in_hvpg) {
>>> +					payload->range.offset = dma & ~HV_HYP_PAGE_MASK;
>>> +					offset_in_hvpg = 0;
>>> +				}
>>
>> I'm not clear on why payload->range.offset needs to be set again.
>> Even after the dma mapping is done, doesn't the offset in the first
>> page have to be the same?  If it wasn't the same, Hyper-V wouldn't
>> be able to process the PFN list correctly.  In fact, couldn't the above
>> code just always set offset_in_hvpg = 0?
> 
> Careful.  DMA mapping is supposed to keep the offset in the page, but
> for that the DMA mapping code needs to know what the device considers a
> "page".  For that the driver needs to set the min_align_mask field in
> struct device_dma_parameters.

The default allocation unit of the swiotlb bounce buffer is IO_TLB_SIZE (2K),
and I find that some SCSI request commands have a length of less than 100 bytes.
Keeping a small unit avoids wasting bounce buffer space; we just need to
update the offset.


> 
>>
>> The whole approach here is to do dma remapping on each individual page
>> of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
>> each scatterlist entry as a unit?  Each scatterlist entry describes a range of
>> physically contiguous memory.  After dma_map_sg(), the resulting dma
>> address must also refer to a physically contiguous range in the swiotlb
>> bounce buffer memory.   So at the top of the "for" loop over the scatterlist
>> entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
>> hvpfn value based on the dma address instead of sg_page().  But everything
>> else is the same, and the inner loop for populating the pfn_arry is unmodified.
>> Furthermore, the dma_range array that you've added is not needed, since
>> scatterlist entries already have a dma_address field for saving the mapped
>> address, and dma_unmap_sg() uses that field.
> 
> Yes, I think dma_map_sg is the right thing to use here, probably even
> for the non-isolated case so that we can get the hv drivers out of their
> little corner and into being more like a normal kernel driver.  That
> is, use the scsi_dma_map/scsi_dma_unmap helpers, and then iterate over
> the dma addresses one page at a time using for_each_sg_dma_page.
> 

I wonder whether we could introduce a new API, scsi_dma_map_with_callback.
The caller would provide a callback that runs inside the sg loop of
dma_direct_map_sg(), letting the caller update its own data structures in
that loop. This is such a case: the driver needs to populate
payload->range.pfn_array[]. That is why I don't use dma_map_sg() here.


>>
>> One thing:  There's a maximum swiotlb mapping size, which I think works
>> out to be 256 Kbytes.  See swiotlb_max_mapping_size().  We need to make
>> sure that we don't get a scatterlist entry bigger than this size.  But I think
>> this already happens because you set the device->dma_mask field in
>> Patch 11 of this series.  __scsi_init_queue checks for this setting and
>> sets max_sectors to limits transfers to the max mapping size.
> 
> Indeed.
>
Michael Kelley (LINUX) Aug. 20, 2021, 4:08 p.m. UTC | #7
From: Tianyu Lan <ltykernel@gmail.com> Sent: Friday, August 20, 2021 8:20 AM
> 
> On 8/20/2021 2:17 AM, Michael Kelley wrote:
> > From: Tianyu Lan <ltykernel@gmail.com> Sent: Monday, August 9, 2021 10:56 AM
> >
> > I'm not clear on why payload->range.offset needs to be set again.
> > Even after the dma mapping is done, doesn't the offset in the first
> > page have to be the same?  If it wasn't the same, Hyper-V wouldn't
> > be able to process the PFN list correctly.  In fact, couldn't the above
> > code just always set offset_in_hvpg = 0?
> 
> The offset will be changed. The swiotlb bounce buffer is allocated with
> IO_TLB_SIZE(2K) as unit. So the offset here may be changed.
> 

We need to prevent the offset from changing.  The storvsc driver passes
just a PFN list to Hyper-V, plus an overall starting offset and length.  Unlike
the netvsc driver, each entry in the PFN list does *not* have its own offset
and length.  Hyper-V assumes that the list is "dense" and that there are
no holes (i.e., unused memory areas).

For example, consider an original buffer passed into storvsc_queuecommand()
of 8 Kbytes, but aligned with 1 Kbytes at the end of the first page, then
4 Kbytes in the second page, and 3 Kbytes in the beginning of the third page.
The offset of that first 1 Kbytes has to remain as 3 Kbytes.  If bounce buffering
moves it to a different offset, there's no way to tell Hyper-V to ignore the
remaining bytes in the first page (at least not without using a different
method to communicate with Hyper-V).   In such a case, the wrong
data will get transferred.  Presumably the easier solution is to set the
min_align_mask field as Christoph suggested.

> 
> >
> >>   	}
> >
> > The whole approach here is to do dma remapping on each individual page
> > of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
> > each scatterlist entry as a unit?  Each scatterlist entry describes a range of
> > physically contiguous memory.  After dma_map_sg(), the resulting dma
> > address must also refer to a physically contiguous range in the swiotlb
> > bounce buffer memory.   So at the top of the "for" loop over the scatterlist
> > entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
> > hvpfn value based on the dma address instead of sg_page().  But everything
> > else is the same, and the inner loop for populating the pfn_arry is unmodified.
> > Furthermore, the dma_range array that you've added is not needed, since
> > scatterlist entries already have a dma_address field for saving the mapped
> > address, and dma_unmap_sg() uses that field.
> 
> I don't use dma_map_sg() here in order to avoid introducing one more
> loop(e,g dma_map_sg()). We already have a loop to populate
> cmd_request->dma_range[] and so do the dma map in the same loop.
> 

I'm not seeing where the additional loop comes from.  Storvsc
already has a loop through the sgl entries.  Retain that loop and call
dma_map_sg() with nents set to 1.  Then the sequence is
dma_map_sg() --> dma_map_sg_attrs() --> dma_direct_map_sg() ->
dma_direct_map_page().  The latter function will call swiotlb_map()
to map all pages of the sgl entry as a single operation.

Michael
Tianyu Lan Aug. 20, 2021, 6:04 p.m. UTC | #8
On 8/21/2021 12:08 AM, Michael Kelley wrote:
>>>>    	}
>>> The whole approach here is to do dma remapping on each individual page
>>> of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
>>> each scatterlist entry as a unit?  Each scatterlist entry describes a range of
>>> physically contiguous memory.  After dma_map_sg(), the resulting dma
>>> address must also refer to a physically contiguous range in the swiotlb
>>> bounce buffer memory.   So at the top of the "for" loop over the scatterlist
>>> entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
>>> hvpfn value based on the dma address instead of sg_page().  But everything
>>> else is the same, and the inner loop for populating the pfn_arry is unmodified.
>>> Furthermore, the dma_range array that you've added is not needed, since
>>> scatterlist entries already have a dma_address field for saving the mapped
>>> address, and dma_unmap_sg() uses that field.
>> I don't use dma_map_sg() here in order to avoid introducing one more
>> loop(e,g dma_map_sg()). We already have a loop to populate
>> cmd_request->dma_range[] and so do the dma map in the same loop.
>>
> I'm not seeing where the additional loop comes from.  Storvsc
> already has a loop through the sgl entries.  Retain that loop and call
> dma_map_sg() with nents set to 1.  Then the sequence is
> dma_map_sg() --> dma_map_sg_attrs() --> dma_direct_map_sg() ->
> dma_direct_map_page().  The latter function will call swiotlb_map()
> to map all pages of the sgl entry as a single operation.

After dma_map_sg(), we still need to go through the scatter list again to
populate payload->range.pfn_array. We could go through the scatter list
only once if dma_map_sg() accepted a callback and ran it while walking the
scatter list.
Michael Kelley (LINUX) Aug. 20, 2021, 7:22 p.m. UTC | #9
From: Tianyu Lan <ltykernel@gmail.com> Sent: Friday, August 20, 2021 11:04 AM
> 
> On 8/21/2021 12:08 AM, Michael Kelley wrote:
> >>>>    	}
> >>> The whole approach here is to do dma remapping on each individual page
> >>> of the I/O buffer.  But wouldn't it be possible to use dma_map_sg() to map
> >>> each scatterlist entry as a unit?  Each scatterlist entry describes a range of
> >>> physically contiguous memory.  After dma_map_sg(), the resulting dma
> >>> address must also refer to a physically contiguous range in the swiotlb
> >>> bounce buffer memory.   So at the top of the "for" loop over the scatterlist
> >>> entries, do dma_map_sg() if we're in an isolated VM.  Then compute the
> >>> hvpfn value based on the dma address instead of sg_page().  But everything
> >>> else is the same, and the inner loop for populating the pfn_arry is unmodified.
> >>> Furthermore, the dma_range array that you've added is not needed, since
> >>> scatterlist entries already have a dma_address field for saving the mapped
> >>> address, and dma_unmap_sg() uses that field.
> >> I don't use dma_map_sg() here in order to avoid introducing one more
> >> loop(e,g dma_map_sg()). We already have a loop to populate
> >> cmd_request->dma_range[] and so do the dma map in the same loop.
> >>
> > I'm not seeing where the additional loop comes from.  Storvsc
> > already has a loop through the sgl entries.  Retain that loop and call
> > dma_map_sg() with nents set to 1.  Then the sequence is
> > dma_map_sg() --> dma_map_sg_attrs() --> dma_direct_map_sg() ->
> > dma_direct_map_page().  The latter function will call swiotlb_map()
> > to map all pages of the sgl entry as a single operation.
> 
> After dma_map_sg(), we still need to go through scatter list again to
> populate payload->rrange.pfn_array. We may just go through the scatter
> list just once if dma_map_sg() accepts a callback and run it during go
> through scatter list.

Here's some code for what I'm suggesting (not even compile tested).
The only change is what's in the "if" clause of the SNP test.  dma_map_sg()
is called with the nents parameter set to one so that it only
processes one sgl entry each time it is called, and doesn't walk the
entire sgl.  Arguably, we don't even need the SNP test and the else
clause -- just always do what's in the if clause.

The corresponding code in storvsc_on_channel_callback would also
have to be changed.   And we still have to set the min_align_mask
so swiotlb will preserve any offset.  Storvsc already has things set up
so that higher levels ensure there are no holes between sgl entries,
and that needs to stay true.

	if (sg_count) {
		unsigned int hvpgoff, hvpfns_to_add;
		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
		u64 hvpfn;
		int nents;

		if (hvpg_count > MAX_PAGE_BUFFER_COUNT) {

			payload_sz = (hvpg_count * sizeof(u64) +
				      sizeof(struct vmbus_packet_mpb_array));
			payload = kzalloc(payload_sz, GFP_ATOMIC);
			if (!payload)
				return SCSI_MLQUEUE_DEVICE_BUSY;
		}

		payload->range.len = length;
		payload->range.offset = offset_in_hvpg;


		for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
			/*
			 * Init values for the current sgl entry. hvpgoff
			 * and hvpfns_to_add are in units of Hyper-V size
			 * pages. Handling the PAGE_SIZE != HV_HYP_PAGE_SIZE
			 * case also handles values of sgl->offset that are
			 * larger than PAGE_SIZE. Such offsets are handled
			 * even on other than the first sgl entry, provided
			 * they are a multiple of PAGE_SIZE.
			 */
			hvpgoff = HVPFN_DOWN(sgl->offset);

			if (hv_isolation_type_snp()) {
				nents = dma_map_sg(dev->device, sgl, 1, scmnd->sc_data_direction);
				if (nents != 1)
					<return error code so higher levels will retry>
				hvpfn = HVPFN_DOWN(sg_dma_address(sgl)) + hvpgoff;
			} else {
				hvpfn = page_to_hvpfn(sg_page(sgl)) + hvpgoff;
			}

			hvpfns_to_add = HVPFN_UP(sgl->offset + sgl->length) -
						hvpgoff;

			/*
			 * Fill the next portion of the PFN array with
			 * sequential Hyper-V PFNs for the contiguous physical
			 * memory described by the sgl entry. The end of the
			 * last sgl should be reached at the same time that
			 * the PFN array is filled.
			 */
			while (hvpfns_to_add--)
				payload->range.pfn_array[i++] = hvpfn++;
		}
	}
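
(For completeness, the matching unmap in storvsc_on_channel_callback() might
then be roughly the following, assuming scmnd is available at completion time
and that the per-entry mapping above was done; again only a sketch, not code
from this series:)

	if (hv_isolation_type_snp() && scsi_sg_count(scmnd))
		dma_unmap_sg(&device->device, scsi_sglist(scmnd),
			     scsi_sg_count(scmnd),
			     scmnd->sc_data_direction);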
Christoph Hellwig Aug. 24, 2021, 8:46 a.m. UTC | #10
On Sat, Aug 21, 2021 at 02:04:11AM +0800, Tianyu Lan wrote:
> After dma_map_sg(), we still need to go through scatter list again to 
> populate payload->rrange.pfn_array. We may just go through the scatter list 
> just once if dma_map_sg() accepts a callback and run it during go
> through scatter list.

Iterating a cache hot array is way faster than doing lots of indirect
calls.
Christoph Hellwig Aug. 24, 2021, 8:49 a.m. UTC | #11
On Fri, Aug 20, 2021 at 03:40:08PM +0000, Michael Kelley wrote:
> I see that the swiotlb code gets and uses the min_align_mask field.  But
> the NVME driver is the only driver that ever sets it, so the value is zero
> in all other cases.  Does swiotlb just use PAGE_SIZE in that that case?  I
> couldn't tell from a quick glance at the swiotlb code.

The encoding isn't all that common.  I only know it for the RDMA memory
registration format, and RDMA in general does not interact very well
with swiotlb (although the in-kernel drivers should work fine, it is
userspace RDMA that is the problem).  It seems recently a new driver
using the format (mpi3mr) also showed up.  All these should probably set
the min_align_mask.

Patch

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 328bb961c281..78320719bdd8 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -21,6 +21,8 @@ 
 #include <linux/device.h>
 #include <linux/hyperv.h>
 #include <linux/blkdev.h>
+#include <linux/io.h>
+#include <linux/dma-mapping.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_host.h>
@@ -427,6 +429,8 @@  struct storvsc_cmd_request {
 	u32 payload_sz;
 
 	struct vstor_packet vstor_packet;
+	u32 hvpg_count;
+	struct hv_dma_range *dma_range;
 };
 
 
@@ -509,6 +513,14 @@  struct storvsc_scan_work {
 	u8 tgt_id;
 };
 
+#define storvsc_dma_map(dev, page, offset, size, dir) \
+	dma_map_page(dev, page, offset, size, dir)
+
+#define storvsc_dma_unmap(dev, dma_range, dir)		\
+		dma_unmap_page(dev, dma_range.dma,	\
+			       dma_range.mapping_size,	\
+			       dir ? DMA_FROM_DEVICE : DMA_TO_DEVICE)
+
 static void storvsc_device_scan(struct work_struct *work)
 {
 	struct storvsc_scan_work *wrk;
@@ -1260,6 +1272,7 @@  static void storvsc_on_channel_callback(void *context)
 	struct hv_device *device;
 	struct storvsc_device *stor_device;
 	struct Scsi_Host *shost;
+	int i;
 
 	if (channel->primary_channel != NULL)
 		device = channel->primary_channel->device_obj;
@@ -1314,6 +1327,15 @@  static void storvsc_on_channel_callback(void *context)
 				request = (struct storvsc_cmd_request *)scsi_cmd_priv(scmnd);
 			}
 
+			if (request->dma_range) {
+				for (i = 0; i < request->hvpg_count; i++)
+					storvsc_dma_unmap(&device->device,
+						request->dma_range[i],
+						request->vstor_packet.vm_srb.data_in == READ_TYPE);
+
+				kfree(request->dma_range);
+			}
+
 			storvsc_on_receive(stor_device, packet, request);
 			continue;
 		}
@@ -1810,7 +1832,9 @@  static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 		unsigned int hvpgoff, hvpfns_to_add;
 		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
 		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
+		dma_addr_t dma;
 		u64 hvpfn;
+		u32 size;
 
 		if (hvpg_count > MAX_PAGE_BUFFER_COUNT) {
 
@@ -1824,6 +1848,13 @@  static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 		payload->range.len = length;
 		payload->range.offset = offset_in_hvpg;
 
+		cmd_request->dma_range = kcalloc(hvpg_count,
+				 sizeof(*cmd_request->dma_range),
+				 GFP_ATOMIC);
+		if (!cmd_request->dma_range) {
+			ret = -ENOMEM;
+			goto free_payload;
+		}
 
 		for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
 			/*
@@ -1847,9 +1878,29 @@  static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 			 * last sgl should be reached at the same time that
 			 * the PFN array is filled.
 			 */
-			while (hvpfns_to_add--)
-				payload->range.pfn_array[i++] =	hvpfn++;
+			while (hvpfns_to_add--) {
+				size = min(HV_HYP_PAGE_SIZE - offset_in_hvpg,
+					   (unsigned long)length);
+				dma = storvsc_dma_map(&dev->device, pfn_to_page(hvpfn++),
+						      offset_in_hvpg, size,
+						      scmnd->sc_data_direction);
+				if (dma_mapping_error(&dev->device, dma)) {
+					ret = -ENOMEM;
+					goto free_dma_range;
+				}
+
+				if (offset_in_hvpg) {
+					payload->range.offset = dma & ~HV_HYP_PAGE_MASK;
+					offset_in_hvpg = 0;
+				}
+
+				cmd_request->dma_range[i].dma = dma;
+				cmd_request->dma_range[i].mapping_size = size;
+				payload->range.pfn_array[i++] = dma >> HV_HYP_PAGE_SHIFT;
+				length -= size;
+			}
 		}
+		cmd_request->hvpg_count = hvpg_count;
 	}
 
 	cmd_request->payload = payload;
@@ -1860,13 +1911,20 @@  static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 	put_cpu();
 
 	if (ret == -EAGAIN) {
-		if (payload_sz > sizeof(cmd_request->mpb))
-			kfree(payload);
 		/* no more space */
-		return SCSI_MLQUEUE_DEVICE_BUSY;
+		ret = SCSI_MLQUEUE_DEVICE_BUSY;
+		goto free_dma_range;
 	}
 
 	return 0;
+
+free_dma_range:
+	kfree(cmd_request->dma_range);
+
+free_payload:
+	if (payload_sz > sizeof(cmd_request->mpb))
+		kfree(payload);
+	return ret;
 }
 
 static struct scsi_host_template scsi_driver = {