diff mbox series

[V2,5/6] net: netvsc: Add Isolation VM support for netvsc driver

Message ID 20211123143039.331929-6-ltykernel@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series x86/Hyper-V: Add Hyper-V Isolation VM support(Second part) | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Tianyu Lan Nov. 23, 2021, 2:30 p.m. UTC
From: Tianyu Lan <Tianyu.Lan@microsoft.com>

In an Isolation VM, all memory shared with the host needs to be marked
visible to the host via hvcall. vmbus_establish_gpadl() has already done
this for the netvsc rx/tx ring buffers. The page buffers used by
vmbus_sendpacket_pagebuffer() still need to be handled. Use the DMA API to
map/unmap this memory when sending/receiving packets; the DMA address of
the Hyper-V swiotlb bounce buffer will be returned. The swiotlb bounce
buffer has been marked visible to the host during boot.

Allocate the rx/tx ring buffers via dma_alloc_noncontiguous() in an
Isolation VM. After calling vmbus_establish_gpadl(), which marks these
pages visible to the host, map these pages into the unencrypted address
space via dma_vmap_noncontiguous().

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   |   5 +
 drivers/net/hyperv/netvsc.c       | 192 +++++++++++++++++++++++++++---
 drivers/net/hyperv/rndis_filter.c |   2 +
 include/linux/hyperv.h            |   6 +
 4 files changed, 190 insertions(+), 15 deletions(-)

Comments

Michael Kelley (LINUX) Nov. 23, 2021, 5:55 p.m. UTC | #1
From: Tianyu Lan <ltykernel@gmail.com> Sent: Tuesday, November 23, 2021 6:31 AM
> 
> In Isolation VM, all shared memory with host needs to mark visible
> to host via hvcall. vmbus_establish_gpadl() has already done it for
> netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
> pagebuffer() stills need to be handled. Use DMA API to map/umap
> these memory during sending/receiving packet and Hyper-V swiotlb
> bounce buffer dma address will be returned. The swiotlb bounce buffer
> has been masked to be visible to host during boot up.
> 
> Allocate rx/tx ring buffer via dma_alloc_noncontiguous() in Isolation
> VM. After calling vmbus_establish_gpadl() which marks these pages visible
> to host, map these pages unencrypted addes space via dma_vmap_noncontiguous().
> 
> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> ---
>  drivers/net/hyperv/hyperv_net.h   |   5 +
>  drivers/net/hyperv/netvsc.c       | 192 +++++++++++++++++++++++++++---
>  drivers/net/hyperv/rndis_filter.c |   2 +
>  include/linux/hyperv.h            |   6 +
>  4 files changed, 190 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index 315278a7cf88..31c77a00d01e 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -164,6 +164,7 @@ struct hv_netvsc_packet {
>  	u32 total_bytes;
>  	u32 send_buf_index;
>  	u32 total_data_buflen;
> +	struct hv_dma_range *dma_range;
>  };
> 
>  #define NETVSC_HASH_KEYLEN 40
> @@ -1074,6 +1075,7 @@ struct netvsc_device {
> 
>  	/* Receive buffer allocated by us but manages by NetVSP */
>  	void *recv_buf;
> +	struct sg_table *recv_sgt;
>  	u32 recv_buf_size; /* allocated bytes */
>  	struct vmbus_gpadl recv_buf_gpadl_handle;
>  	u32 recv_section_cnt;
> @@ -1082,6 +1084,7 @@ struct netvsc_device {
> 
>  	/* Send buffer allocated by us */
>  	void *send_buf;
> +	struct sg_table *send_sgt;
>  	u32 send_buf_size;
>  	struct vmbus_gpadl send_buf_gpadl_handle;
>  	u32 send_section_cnt;
> @@ -1731,4 +1734,6 @@ struct rndis_message {
>  #define RETRY_US_HI	10000
>  #define RETRY_MAX	2000	/* >10 sec */
> 
> +void netvsc_dma_unmap(struct hv_device *hv_dev,
> +		      struct hv_netvsc_packet *packet);
>  #endif /* _HYPERV_NET_H */
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 396bc1c204e6..9cdc71930830 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -20,6 +20,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/rtnetlink.h>
>  #include <linux/prefetch.h>
> +#include <linux/gfp.h>
> 
>  #include <asm/sync_bitops.h>
>  #include <asm/mshyperv.h>
> @@ -146,15 +147,39 @@ static struct netvsc_device *alloc_net_device(void)
>  	return net_device;
>  }
> 
> +static struct hv_device *netvsc_channel_to_device(struct vmbus_channel *channel)
> +{
> +	struct vmbus_channel *primary = channel->primary_channel;
> +
> +	return primary ? primary->device_obj : channel->device_obj;
> +}
> +
>  static void free_netvsc_device(struct rcu_head *head)
>  {
>  	struct netvsc_device *nvdev
>  		= container_of(head, struct netvsc_device, rcu);
> +	struct hv_device *dev =
> +		netvsc_channel_to_device(nvdev->chan_table[0].channel);
>  	int i;
> 
>  	kfree(nvdev->extension);
> -	vfree(nvdev->recv_buf);
> -	vfree(nvdev->send_buf);
> +
> +	if (nvdev->recv_sgt) {
> +		dma_vunmap_noncontiguous(&dev->device, nvdev->recv_buf);
> +		dma_free_noncontiguous(&dev->device, nvdev->recv_buf_size,
> +				       nvdev->recv_sgt, DMA_FROM_DEVICE);
> +	} else {
> +		vfree(nvdev->recv_buf);
> +	}
> +
> +	if (nvdev->send_sgt) {
> +		dma_vunmap_noncontiguous(&dev->device, nvdev->send_buf);
> +		dma_free_noncontiguous(&dev->device, nvdev->send_buf_size,
> +				       nvdev->send_sgt, DMA_TO_DEVICE);
> +	} else {
> +		vfree(nvdev->send_buf);
> +	}
> +
>  	kfree(nvdev->send_section_map);
> 
>  	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
> @@ -348,7 +373,21 @@ static int netvsc_init_buf(struct hv_device *device,
>  		buf_size = min_t(unsigned int, buf_size,
>  				 NETVSC_RECEIVE_BUFFER_SIZE_LEGACY);
> 
> -	net_device->recv_buf = vzalloc(buf_size);
> +	if (hv_isolation_type_snp()) {
> +		net_device->recv_sgt =
> +			dma_alloc_noncontiguous(&device->device, buf_size,
> +						DMA_FROM_DEVICE, GFP_KERNEL, 0);
> +		if (!net_device->recv_sgt) {
> +			pr_err("Fail to allocate recv buffer buf_size %d.\n.", buf_size);
> +			ret = -ENOMEM;
> +			goto cleanup;
> +		}
> +
> +		net_device->recv_buf = (void *)net_device->recv_sgt->sgl->dma_address;

Use sg_dma_address() macro.

> +	} else {
> +		net_device->recv_buf = vzalloc(buf_size);
> +	}
> +
>  	if (!net_device->recv_buf) {
>  		netdev_err(ndev,
>  			   "unable to allocate receive buffer of size %u\n",
> @@ -357,8 +396,6 @@ static int netvsc_init_buf(struct hv_device *device,
>  		goto cleanup;
>  	}
> 
> -	net_device->recv_buf_size = buf_size;
> -
>  	/*
>  	 * Establish the gpadl handle for this buffer on this
>  	 * channel.  Note: This call uses the vmbus connection rather
> @@ -373,6 +410,19 @@ static int netvsc_init_buf(struct hv_device *device,
>  		goto cleanup;
>  	}
> 
> +	if (net_device->recv_sgt) {
> +		net_device->recv_buf =
> +			dma_vmap_noncontiguous(&device->device, buf_size,
> +					       net_device->recv_sgt);
> +		if (!net_device->recv_buf) {
> +			pr_err("Fail to vmap recv buffer.\n");
> +			ret = -ENOMEM;
> +			goto cleanup;
> +		}
> +	}
> +
> +	net_device->recv_buf_size = buf_size;
> +
>  	/* Notify the NetVsp of the gpadl handle */
>  	init_packet = &net_device->channel_init_pkt;
>  	memset(init_packet, 0, sizeof(struct nvsp_message));
> @@ -454,14 +504,27 @@ static int netvsc_init_buf(struct hv_device *device,
>  	buf_size = device_info->send_sections * device_info->send_section_size;
>  	buf_size = round_up(buf_size, PAGE_SIZE);
> 
> -	net_device->send_buf = vzalloc(buf_size);
> +	if (hv_isolation_type_snp()) {
> +		net_device->send_sgt =
> +			dma_alloc_noncontiguous(&device->device, buf_size,
> +						DMA_TO_DEVICE, GFP_KERNEL, 0);
> +		if (!net_device->send_sgt) {
> +			pr_err("Fail to allocate send buffer buf_size %d.\n.", buf_size);
> +			ret = -ENOMEM;
> +			goto cleanup;
> +		}
> +
> +		net_device->send_buf = (void *)net_device->send_sgt->sgl->dma_address;

Use sg_dma_address() macro.

> +	} else {
> +		net_device->send_buf = vzalloc(buf_size);
> +	}
> +
>  	if (!net_device->send_buf) {
>  		netdev_err(ndev, "unable to allocate send buffer of size %u\n",
>  			   buf_size);
>  		ret = -ENOMEM;
>  		goto cleanup;
>  	}
> -	net_device->send_buf_size = buf_size;
> 
>  	/* Establish the gpadl handle for this buffer on this
>  	 * channel.  Note: This call uses the vmbus connection rather
> @@ -476,6 +539,19 @@ static int netvsc_init_buf(struct hv_device *device,
>  		goto cleanup;
>  	}
> 
> +	if (net_device->send_sgt) {
> +		net_device->send_buf =
> +			dma_vmap_noncontiguous(&device->device, buf_size,
> +					       net_device->send_sgt);
> +		if (!net_device->send_buf) {
> +			pr_err("Fail to vmap send buffer.\n");
> +			ret = -ENOMEM;
> +			goto cleanup;
> +		}
> +	}
> +
> +	net_device->send_buf_size = buf_size;
> +
>  	/* Notify the NetVsp of the gpadl handle */
>  	init_packet = &net_device->channel_init_pkt;
>  	memset(init_packet, 0, sizeof(struct nvsp_message));
> @@ -766,7 +842,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
> 
>  	/* Notify the layer above us */
>  	if (likely(skb)) {
> -		const struct hv_netvsc_packet *packet
> +		struct hv_netvsc_packet *packet
>  			= (struct hv_netvsc_packet *)skb->cb;
>  		u32 send_index = packet->send_buf_index;
>  		struct netvsc_stats *tx_stats;
> @@ -782,6 +858,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
>  		tx_stats->bytes += packet->total_bytes;
>  		u64_stats_update_end(&tx_stats->syncp);
> 
> +		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
>  		napi_consume_skb(skb, budget);
>  	}
> 
> @@ -946,6 +1023,87 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
>  		memset(dest, 0, padding);
>  }
> 
> +void netvsc_dma_unmap(struct hv_device *hv_dev,
> +		      struct hv_netvsc_packet *packet)
> +{
> +	u32 page_count = packet->cp_partial ?
> +		packet->page_buf_cnt - packet->rmsg_pgcnt :
> +		packet->page_buf_cnt;
> +	int i;
> +
> +	if (!hv_is_isolation_supported())
> +		return;
> +
> +	if (!packet->dma_range)
> +		return;
> +
> +	for (i = 0; i < page_count; i++)
> +		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
> +				 packet->dma_range[i].mapping_size,
> +				 DMA_TO_DEVICE);
> +
> +	kfree(packet->dma_range);
> +}
> +
> +/* netvsc_dma_map - Map swiotlb bounce buffer with data page of
> + * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation
> + * VM.
> + *
> + * In isolation VM, netvsc send buffer has been marked visible to
> + * host and so the data copied to send buffer doesn't need to use
> + * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
> + * may not be copied to send buffer and so these pages need to be
> + * mapped with swiotlb bounce buffer. netvsc_dma_map() is to do
> + * that. The pfns in the struct hv_page_buffer need to be converted
> + * to bounce buffer's pfn. The loop here is necessary because the
> + * entries in the page buffer array are not necessarily full
> + * pages of data.  Each entry in the array has a separate offset and
> + * len that may be non-zero, even for entries in the middle of the
> + * array.  And the entries are not physically contiguous.  So each
> + * entry must be individually mapped rather than as a contiguous unit.
> + * So not use dma_map_sg() here.
> + */
> +static int netvsc_dma_map(struct hv_device *hv_dev,
> +			  struct hv_netvsc_packet *packet,
> +			  struct hv_page_buffer *pb)
> +{
> +	u32 page_count =  packet->cp_partial ?
> +		packet->page_buf_cnt - packet->rmsg_pgcnt :
> +		packet->page_buf_cnt;
> +	dma_addr_t dma;
> +	int i;
> +
> +	if (!hv_is_isolation_supported())
> +		return 0;
> +
> +	packet->dma_range = kcalloc(page_count,
> +				    sizeof(*packet->dma_range),
> +				    GFP_KERNEL);
> +	if (!packet->dma_range)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < page_count; i++) {
> +		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
> +					 + pb[i].offset);
> +		u32 len = pb[i].len;
> +
> +		dma = dma_map_single(&hv_dev->device, src, len,
> +				     DMA_TO_DEVICE);
> +		if (dma_mapping_error(&hv_dev->device, dma)) {
> +			kfree(packet->dma_range);
> +			return -ENOMEM;
> +		}
> +
> +		packet->dma_range[i].dma = dma;
> +		packet->dma_range[i].mapping_size = len;
> +		pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
> +		pb[i].offset = offset_in_hvpage(dma);
> +		pb[i].len = len;

As noted in comments on an earlier version of this patch, the
pb[i].len and .offset fields should not be changed by doing
dma_map_single().  So there's no need to set them again here.  Adding
a comment to that effect might be good.

> +	}
> +
> +	return 0;
> +}
> +
>  static inline int netvsc_send_pkt(
>  	struct hv_device *device,
>  	struct hv_netvsc_packet *packet,
> @@ -986,14 +1144,24 @@ static inline int netvsc_send_pkt(
> 
>  	trace_nvsp_send_pkt(ndev, out_channel, rpkt);
> 
> +	packet->dma_range = NULL;
>  	if (packet->page_buf_cnt) {
>  		if (packet->cp_partial)
>  			pb += packet->rmsg_pgcnt;
> 
> +		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
> +		if (ret) {
> +			ret = -EAGAIN;
> +			goto exit;
> +		}
> +
>  		ret = vmbus_sendpacket_pagebuffer(out_channel,
>  						  pb, packet->page_buf_cnt,
>  						  &nvmsg, sizeof(nvmsg),
>  						  req_id);
> +
> +		if (ret)
> +			netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
>  	} else {
>  		ret = vmbus_sendpacket(out_channel,
>  				       &nvmsg, sizeof(nvmsg),
> @@ -1001,6 +1169,7 @@ static inline int netvsc_send_pkt(
>  				       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
>  	}
> 
> +exit:
>  	if (ret == 0) {
>  		atomic_inc_return(&nvchan->queue_sends);
> 
> @@ -1515,13 +1684,6 @@ static int netvsc_process_raw_pkt(struct hv_device *device,
>  	return 0;
>  }
> 
> -static struct hv_device *netvsc_channel_to_device(struct vmbus_channel *channel)
> -{
> -	struct vmbus_channel *primary = channel->primary_channel;
> -
> -	return primary ? primary->device_obj : channel->device_obj;
> -}
> -
>  /* Network processing softirq
>   * Process data in incoming ring buffer from host
>   * Stops when ring is empty or budget is met or exceeded.
> diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
> index f6c9c2a670f9..448fcc325ed7 100644
> --- a/drivers/net/hyperv/rndis_filter.c
> +++ b/drivers/net/hyperv/rndis_filter.c
> @@ -361,6 +361,8 @@ static void rndis_filter_receive_response(struct net_device *ndev,
>  			}
>  		}
> 
> +		netvsc_dma_unmap(((struct net_device_context *)
> +			netdev_priv(ndev))->device_ctx, &request->pkt);
>  		complete(&request->wait_event);
>  	} else {
>  		netdev_err(ndev,
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 4d44fb3b3f1c..8882e46d1070 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -25,6 +25,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/reciprocal_div.h>
>  #include <asm/hyperv-tlfs.h>
> +#include <linux/dma-map-ops.h>
> 
>  #define MAX_PAGE_BUFFER_COUNT				32
>  #define MAX_MULTIPAGE_BUFFER_COUNT			32 /* 128K */
> @@ -1583,6 +1584,11 @@ struct hyperv_service_callback {
>  	void (*callback)(void *context);
>  };
> 
> +struct hv_dma_range {
> +	dma_addr_t dma;
> +	u32 mapping_size;
> +};
> +
>  #define MAX_SRV_VER	0x7ffffff
>  extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
>  				const int *fw_version, int fw_vercnt,
> --
> 2.25.1
Michael Kelley (LINUX) Nov. 24, 2021, 5:03 p.m. UTC | #2
From: Tianyu Lan <ltykernel@gmail.com> Sent: Tuesday, November 23, 2021 6:31 AM
> 
> In Isolation VM, all shared memory with host needs to mark visible
> to host via hvcall. vmbus_establish_gpadl() has already done it for
> netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
> pagebuffer() stills need to be handled. Use DMA API to map/umap
> these memory during sending/receiving packet and Hyper-V swiotlb
> bounce buffer dma address will be returned. The swiotlb bounce buffer
> has been masked to be visible to host during boot up.
> 
> Allocate rx/tx ring buffer via dma_alloc_noncontiguous() in Isolation
> VM. After calling vmbus_establish_gpadl() which marks these pages visible
> to host, map these pages unencrypted addes space via dma_vmap_noncontiguous().
> 

The big unresolved topic is how best to do the allocation and mapping of the big
netvsc send and receive buffers.  Let me summarize and make a recommendation.

Background
==========
1.  Each Hyper-V synthetic network device requires a large pre-allocated receive
     buffer (defaults to 16 Mbytes) and a similar send buffer (defaults to 1 Mbyte).
2.  The buffers are allocated in guest memory and shared with the Hyper-V host.
     As such, in the Hyper-V SNP environment, the memory must be unencrypted
     and accessed in the Hyper-V guest with shared_gpa_boundary (i.e., VTOM)
     added to the physical memory address.
3.  The buffers need *not* be contiguous in guest physical memory, but must be
     contiguously mapped in guest kernel virtual space.
4.  Network devices may come and go during the life of the VM, so allocation of
     these buffers and their mappings may be done after Linux has been running for
     a long time.
5.  Performance of the allocation and mapping process is not an issue since it is
     done only on synthetic network device add/remove.
6.  So the primary goals are an appropriate logical abstraction, code that is
     simple and straightforward, and efficient memory usage.

Approaches
==========
During the development of these patches, four approaches have been
implemented:

1.  Two virtual mappings:  One from vmalloc() to allocate the guest memory, and
     the second from vmap_pfns() after adding the shared_gpa_boundary.   This is
     implemented in Hyper-V or netvsc specific code, with no use of DMA APIs.
     No separate list of physical pages is maintained, so for creating the second
     mapping, the PFN list is assembled temporarily by doing virt-to-phys()
     page-by-page on the vmalloc mapping, and then discarded because it is no
     longer needed.  [v4 of the original patch series.]

2.  Two virtual mappings as in (1) above, but implemented via new DMA calls
     dma_map_decrypted() and dma_unmap_encrypted().  [v3 of the original
     patch series.]

3.  Two virtual mappings as in (1) above, but implemented via DMA noncontiguous
      allocation and mapping calls, as enhanced to allow for custom map/unmap
      implementations.  A list of physical pages is maintained in the dma_sgt_handle
      as expected by the DMA noncontiguous API.  [New split-off patch series v1 & v2]

4.   Single virtual mapping from vmap_pfns().  The netvsc driver allocates physical
      memory via alloc_pages() with as much contiguity as possible, and maintains a
      list of physical pages and ranges.   A single virtual mapping is set up with vmap_pfns()
      after adding shared_gpa_boundary.  [v5 of the original patch series.]

Both implementations using DMA APIs use very little of the existing DMA
machinery.  Both require extensions to the DMA APIs, and custom ops functions.
While in some sense the netvsc send and receive buffers involve DMA, they
do not require any DMA actions on a per-I/O basis.  It seems better to me to
not try to fit these two buffers into the DMA model as a one-off.  Let's just
use Hyper-V specific code to allocate and map them, as is done with the
Hyper-V VMbus channel ring buffers.

That leaves approaches (1) and (4) above.  Between those two, (1) is
simpler even though there are two virtual mappings.  Using alloc_pages() as
in (4) is messy and there's no real benefit to using higher order allocations.
(4) also requires maintaining a separate list of PFNs and ranges, which offsets
some of the benefits to having only one virtual mapping active at any point in
time.

I don't think there's a clear "right" answer, so it's a judgment call.  We've
explored what other approaches would look like, and I'd say let's go with
(1) as the simpler approach.  Thoughts?

Michael
Haiyang Zhang Nov. 25, 2021, 9:58 p.m. UTC | #3
> -----Original Message-----
> From: Michael Kelley (LINUX) <mikelley@microsoft.com>
> Sent: Wednesday, November 24, 2021 12:03 PM
> To: Tianyu Lan <ltykernel@gmail.com>; tglx@linutronix.de; mingo@redhat.com; bp@alien8.de;
> dave.hansen@linux.intel.com; x86@kernel.org; hpa@zytor.com; luto@kernel.org;
> peterz@infradead.org; jgross@suse.com; sstabellini@kernel.org; boris.ostrovsky@oracle.com;
> KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Stephen
> Hemminger <sthemmin@microsoft.com>; wei.liu@kernel.org; Dexuan Cui <decui@microsoft.com>;
> joro@8bytes.org; will@kernel.org; davem@davemloft.net; kuba@kernel.org; jejb@linux.ibm.com;
> martin.petersen@oracle.com; hch@lst.de; m.szyprowski@samsung.com; robin.murphy@arm.com;
> Tianyu Lan <Tianyu.Lan@microsoft.com>; thomas.lendacky@amd.com; xen-
> devel@lists.xenproject.org
> Cc: iommu@lists.linux-foundation.org; linux-hyperv@vger.kernel.org; linux-
> kernel@vger.kernel.org; linux-scsi@vger.kernel.org; netdev@vger.kernel.org; vkuznets
> <vkuznets@redhat.com>; brijesh.singh@amd.com; konrad.wilk@oracle.com;
> parri.andrea@gmail.com; dave.hansen@intel.com
> Subject: RE: [PATCH V2 5/6] net: netvsc: Add Isolation VM support for netvsc driver
> 
> From: Tianyu Lan <ltykernel@gmail.com> Sent: Tuesday, November 23, 2021 6:31 AM
> >
> > In Isolation VM, all shared memory with host needs to mark visible to
> > host via hvcall. vmbus_establish_gpadl() has already done it for
> > netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
> > pagebuffer() stills need to be handled. Use DMA API to map/umap these
> > memory during sending/receiving packet and Hyper-V swiotlb bounce
> > buffer dma address will be returned. The swiotlb bounce buffer has
> > been masked to be visible to host during boot up.
> >
> > Allocate rx/tx ring buffer via dma_alloc_noncontiguous() in Isolation
> > VM. After calling vmbus_establish_gpadl() which marks these pages
> > visible to host, map these pages unencrypted addes space via dma_vmap_noncontiguous().
> >
> 
> The big unresolved topic is how best to do the allocation and mapping of the big netvsc
> send and receive buffers.  Let me summarize and make a recommendation.
> 
> Background
> ==========
> 1.  Each Hyper-V synthetic network device requires a large pre-allocated receive
>      buffer (defaults to 16 Mbytes) and a similar send buffer (defaults to 1 Mbyte).
> 2.  The buffers are allocated in guest memory and shared with the Hyper-V host.
>      As such, in the Hyper-V SNP environment, the memory must be unencrypted
>      and accessed in the Hyper-V guest with shared_gpa_boundary (i.e., VTOM)
>      added to the physical memory address.
> 3.  The buffers need *not* be contiguous in guest physical memory, but must be
>      contiguously mapped in guest kernel virtual space.
> 4.  Network devices may come and go during the life of the VM, so allocation of
>      these buffers and their mappings may be done after Linux has been running for
>      a long time.
> 5.  Performance of the allocation and mapping process is not an issue since it is
>      done only on synthetic network device add/remove.
> 6.  So the primary goals are an appropriate logical abstraction, code that is
>      simple and straightforward, and efficient memory usage.
> 
> Approaches
> ==========
> During the development of these patches, four approaches have been
> implemented:
> 
> 1.  Two virtual mappings:  One from vmalloc() to allocate the guest memory, and
>      the second from vmap_pfns() after adding the shared_gpa_boundary.   This is
>      implemented in Hyper-V or netvsc specific code, with no use of DMA APIs.
>      No separate list of physical pages is maintained, so for creating the second
>      mapping, the PFN list is assembled temporarily by doing virt-to-phys()
>      page-by-page on the vmalloc mapping, and then discarded because it is no
>      longer needed.  [v4 of the original patch series.]
> 
> 2.  Two virtual mappings as in (1) above, but implemented via new DMA calls
>      dma_map_decrypted() and dma_unmap_encrypted().  [v3 of the original
>      patch series.]
> 
> 3.  Two virtual mappings as in (1) above, but implemented via DMA noncontiguous
>       allocation and mapping calls, as enhanced to allow for custom map/unmap
>       implementations.  A list of physical pages is maintained in the dma_sgt_handle
>       as expected by the DMA noncontiguous API.  [New split-off patch series v1 & v2]
> 
> 4.   Single virtual mapping from vmap_pfns().  The netvsc driver allocates physical
>       memory via alloc_pages() with as much contiguity as possible, and maintains a
>       list of physical pages and ranges.   Single virtual map is setup with vmap_pfns()
>       after adding shared_gpa_boundary.  [v5 of the original patch series.]
> 
> Both implementations using DMA APIs use very little of the existing DMA machinery.  Both
> require extensions to the DMA APIs, and custom ops functions.
> While in some sense the netvsc send and receive buffers involve DMA, they do not require
> any DMA actions on a per-I/O basis.  It seems better to me to not try to fit these two
> buffers into the DMA model as a one-off.  Let's just use Hyper-V specific code to allocate
> and map them, as is done with the Hyper-V VMbus channel ring buffers.
> 
> That leaves approaches (1) and (4) above.  Between those two, (1) is simpler even though
> there are two virtual mappings.  Using alloc_pages() as in (4) is messy and there's no
> real benefit to using higher order allocations.
> (4) also requires maintaining a separate list of PFNs and ranges, which offsets some of
> the benefits to having only one virtual mapping active at any point in time.
> 
> I don't think there's a clear "right" answer, so it's a judgment call.  We've explored
> what other approaches would look like, and I'd say let's go with
> (1) as the simpler approach.  Thoughts?
> 
I agree with the following goal:
"So the primary goals are an appropriate logical abstraction, code that is
     simple and straightforward, and efficient memory usage."

And the Approach #1 looks better to me as well.

Thanks,
- Haiyang
diff mbox series

Patch

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 315278a7cf88..31c77a00d01e 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -164,6 +164,7 @@  struct hv_netvsc_packet {
 	u32 total_bytes;
 	u32 send_buf_index;
 	u32 total_data_buflen;
+	struct hv_dma_range *dma_range;
 };
 
 #define NETVSC_HASH_KEYLEN 40
@@ -1074,6 +1075,7 @@  struct netvsc_device {
 
 	/* Receive buffer allocated by us but manages by NetVSP */
 	void *recv_buf;
+	struct sg_table *recv_sgt;
 	u32 recv_buf_size; /* allocated bytes */
 	struct vmbus_gpadl recv_buf_gpadl_handle;
 	u32 recv_section_cnt;
@@ -1082,6 +1084,7 @@  struct netvsc_device {
 
 	/* Send buffer allocated by us */
 	void *send_buf;
+	struct sg_table *send_sgt;
 	u32 send_buf_size;
 	struct vmbus_gpadl send_buf_gpadl_handle;
 	u32 send_section_cnt;
@@ -1731,4 +1734,6 @@  struct rndis_message {
 #define RETRY_US_HI	10000
 #define RETRY_MAX	2000	/* >10 sec */
 
+void netvsc_dma_unmap(struct hv_device *hv_dev,
+		      struct hv_netvsc_packet *packet);
 #endif /* _HYPERV_NET_H */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 396bc1c204e6..9cdc71930830 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -20,6 +20,7 @@ 
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
 #include <linux/prefetch.h>
+#include <linux/gfp.h>
 
 #include <asm/sync_bitops.h>
 #include <asm/mshyperv.h>
@@ -146,15 +147,39 @@  static struct netvsc_device *alloc_net_device(void)
 	return net_device;
 }
 
+static struct hv_device *netvsc_channel_to_device(struct vmbus_channel *channel)
+{
+	struct vmbus_channel *primary = channel->primary_channel;
+
+	return primary ? primary->device_obj : channel->device_obj;
+}
+
 static void free_netvsc_device(struct rcu_head *head)
 {
 	struct netvsc_device *nvdev
 		= container_of(head, struct netvsc_device, rcu);
+	struct hv_device *dev =
+		netvsc_channel_to_device(nvdev->chan_table[0].channel);
 	int i;
 
 	kfree(nvdev->extension);
-	vfree(nvdev->recv_buf);
-	vfree(nvdev->send_buf);
+
+	if (nvdev->recv_sgt) {
+		dma_vunmap_noncontiguous(&dev->device, nvdev->recv_buf);
+		dma_free_noncontiguous(&dev->device, nvdev->recv_buf_size,
+				       nvdev->recv_sgt, DMA_FROM_DEVICE);
+	} else {
+		vfree(nvdev->recv_buf);
+	}
+
+	if (nvdev->send_sgt) {
+		dma_vunmap_noncontiguous(&dev->device, nvdev->send_buf);
+		dma_free_noncontiguous(&dev->device, nvdev->send_buf_size,
+				       nvdev->send_sgt, DMA_TO_DEVICE);
+	} else {
+		vfree(nvdev->send_buf);
+	}
+
 	kfree(nvdev->send_section_map);
 
 	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
@@ -348,7 +373,21 @@  static int netvsc_init_buf(struct hv_device *device,
 		buf_size = min_t(unsigned int, buf_size,
 				 NETVSC_RECEIVE_BUFFER_SIZE_LEGACY);
 
-	net_device->recv_buf = vzalloc(buf_size);
+	if (hv_isolation_type_snp()) {
+		net_device->recv_sgt =
+			dma_alloc_noncontiguous(&device->device, buf_size,
+						DMA_FROM_DEVICE, GFP_KERNEL, 0);
+		if (!net_device->recv_sgt) {
+			pr_err("Fail to allocate recv buffer buf_size %d.\n.", buf_size);
+			ret = -ENOMEM;
+			goto cleanup;
+		}
+
+		net_device->recv_buf = (void *)net_device->recv_sgt->sgl->dma_address;
+	} else {
+		net_device->recv_buf = vzalloc(buf_size);
+	}
+
 	if (!net_device->recv_buf) {
 		netdev_err(ndev,
 			   "unable to allocate receive buffer of size %u\n",
@@ -357,8 +396,6 @@  static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
-	net_device->recv_buf_size = buf_size;
-
 	/*
 	 * Establish the gpadl handle for this buffer on this
 	 * channel.  Note: This call uses the vmbus connection rather
@@ -373,6 +410,19 @@  static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
+	if (net_device->recv_sgt) {
+		net_device->recv_buf =
+			dma_vmap_noncontiguous(&device->device, buf_size,
+					       net_device->recv_sgt);
+		if (!net_device->recv_buf) {
+			pr_err("Fail to vmap recv buffer.\n");
+			ret = -ENOMEM;
+			goto cleanup;
+		}
+	}
+
+	net_device->recv_buf_size = buf_size;
+
 	/* Notify the NetVsp of the gpadl handle */
 	init_packet = &net_device->channel_init_pkt;
 	memset(init_packet, 0, sizeof(struct nvsp_message));
@@ -454,14 +504,27 @@  static int netvsc_init_buf(struct hv_device *device,
 	buf_size = device_info->send_sections * device_info->send_section_size;
 	buf_size = round_up(buf_size, PAGE_SIZE);
 
-	net_device->send_buf = vzalloc(buf_size);
+	if (hv_isolation_type_snp()) {
+		net_device->send_sgt =
+			dma_alloc_noncontiguous(&device->device, buf_size,
+						DMA_TO_DEVICE, GFP_KERNEL, 0);
+		if (!net_device->send_sgt) {
+			pr_err("Fail to allocate send buffer buf_size %d.\n.", buf_size);
+			ret = -ENOMEM;
+			goto cleanup;
+		}
+
+		net_device->send_buf = (void *)net_device->send_sgt->sgl->dma_address;
+	} else {
+		net_device->send_buf = vzalloc(buf_size);
+	}
+
 	if (!net_device->send_buf) {
 		netdev_err(ndev, "unable to allocate send buffer of size %u\n",
 			   buf_size);
 		ret = -ENOMEM;
 		goto cleanup;
 	}
-	net_device->send_buf_size = buf_size;
 
 	/* Establish the gpadl handle for this buffer on this
 	 * channel.  Note: This call uses the vmbus connection rather
@@ -476,6 +539,19 @@  static int netvsc_init_buf(struct hv_device *device,
 		goto cleanup;
 	}
 
+	if (net_device->send_sgt) {
+		net_device->send_buf =
+			dma_vmap_noncontiguous(&device->device, buf_size,
+					       net_device->send_sgt);
+		if (!net_device->send_buf) {
+			pr_err("Fail to vmap send buffer.\n");
+			ret = -ENOMEM;
+			goto cleanup;
+		}
+	}
+
+	net_device->send_buf_size = buf_size;
+
 	/* Notify the NetVsp of the gpadl handle */
 	init_packet = &net_device->channel_init_pkt;
 	memset(init_packet, 0, sizeof(struct nvsp_message));
@@ -766,7 +842,7 @@  static void netvsc_send_tx_complete(struct net_device *ndev,
 
 	/* Notify the layer above us */
 	if (likely(skb)) {
-		const struct hv_netvsc_packet *packet
+		struct hv_netvsc_packet *packet
 			= (struct hv_netvsc_packet *)skb->cb;
 		u32 send_index = packet->send_buf_index;
 		struct netvsc_stats *tx_stats;
@@ -782,6 +858,7 @@  static void netvsc_send_tx_complete(struct net_device *ndev,
 		tx_stats->bytes += packet->total_bytes;
 		u64_stats_update_end(&tx_stats->syncp);
 
+		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
 		napi_consume_skb(skb, budget);
 	}
 
@@ -946,6 +1023,87 @@  static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 		memset(dest, 0, padding);
 }
 
+/* netvsc_dma_unmap - Undo the mappings made by netvsc_dma_map().
+ * No-op when isolation is unsupported or nothing is mapped; clears
+ * packet->dma_range so a repeated call for the same packet is safe.
+ */
+void netvsc_dma_unmap(struct hv_device *hv_dev,
+		      struct hv_netvsc_packet *packet)
+{
+	u32 i;
+
+	if (!hv_is_isolation_supported() || !packet->dma_range)
+		return;
+
+	for (i = 0; i < packet->page_buf_cnt; i++)
+		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
+				 packet->dma_range[i].mapping_size,
+				 DMA_TO_DEVICE);
+
+	kfree(packet->dma_range);
+	packet->dma_range = NULL;
+}
+
+/* netvsc_dma_map - Map swiotlb bounce buffer with data page of
+ * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation VM.
+ *
+ * The netvsc send buffer is already visible to the host, but data
+ * pages handed to vmbus_sendpacket_pagebuffer() are not copied there
+ * and must be mapped through the DMA API; the pfn/offset of each
+ * struct hv_page_buffer entry is replaced by that of its DMA address.
+ * Entries need not be full pages or physically contiguous, so each
+ * is mapped individually rather than with dma_map_sg().
+ *
+ * @pb is the first of the packet->page_buf_cnt entries given to the
+ * host. Returns 0 or -ENOMEM; on failure nothing stays mapped, @pb
+ * is unmodified and packet->dma_range is NULL.
+ */
+static int netvsc_dma_map(struct hv_device *hv_dev,
+			  struct hv_netvsc_packet *packet,
+			  struct hv_page_buffer *pb)
+{
+	dma_addr_t dma;
+	u32 i;
+
+	if (!hv_is_isolation_supported())
+		return 0;
+
+	/* The transmit path may run in atomic context, so do not sleep. */
+	packet->dma_range = kcalloc(packet->page_buf_cnt,
+				    sizeof(*packet->dma_range), GFP_ATOMIC);
+	if (!packet->dma_range)
+		return -ENOMEM;
+
+	for (i = 0; i < packet->page_buf_cnt; i++) {
+		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
+					 + pb[i].offset);
+
+		dma = dma_map_single(&hv_dev->device, src, pb[i].len,
+				     DMA_TO_DEVICE);
+		if (dma_mapping_error(&hv_dev->device, dma))
+			goto unwind;
+		packet->dma_range[i].dma = dma;
+		packet->dma_range[i].mapping_size = pb[i].len;
+	}
+
+	/* Everything mapped: only now point the host at the DMA pages. */
+	for (i = 0; i < packet->page_buf_cnt; i++) {
+		pb[i].pfn = packet->dma_range[i].dma >> HV_HYP_PAGE_SHIFT;
+		pb[i].offset = offset_in_hvpage(packet->dma_range[i].dma);
+	}
+	return 0;
+
+unwind:
+	/* Drop the mappings made so far; the caller does not unmap. */
+	while (i--)
+		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
+				 packet->dma_range[i].mapping_size,
+				 DMA_TO_DEVICE);
+	kfree(packet->dma_range);
+	packet->dma_range = NULL;
+	return -ENOMEM;
+}
+
 static inline int netvsc_send_pkt(
 	struct hv_device *device,
 	struct hv_netvsc_packet *packet,
@@ -986,14 +1144,24 @@  static inline int netvsc_send_pkt(
 
 	trace_nvsp_send_pkt(ndev, out_channel, rpkt);
 
+	packet->dma_range = NULL;
 	if (packet->page_buf_cnt) {
 		if (packet->cp_partial)
 			pb += packet->rmsg_pgcnt;
 
+		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
+		if (ret) {
+			ret = -EAGAIN;
+			goto exit;
+		}
+
 		ret = vmbus_sendpacket_pagebuffer(out_channel,
 						  pb, packet->page_buf_cnt,
 						  &nvmsg, sizeof(nvmsg),
 						  req_id);
+
+		if (ret)
+			netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
 	} else {
 		ret = vmbus_sendpacket(out_channel,
 				       &nvmsg, sizeof(nvmsg),
@@ -1001,6 +1169,7 @@  static inline int netvsc_send_pkt(
 				       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	}
 
+exit:
 	if (ret == 0) {
 		atomic_inc_return(&nvchan->queue_sends);
 
@@ -1515,13 +1684,6 @@  static int netvsc_process_raw_pkt(struct hv_device *device,
 	return 0;
 }
 
-static struct hv_device *netvsc_channel_to_device(struct vmbus_channel *channel)
-{
-	struct vmbus_channel *primary = channel->primary_channel;
-
-	return primary ? primary->device_obj : channel->device_obj;
-}
-
 /* Network processing softirq
  * Process data in incoming ring buffer from host
  * Stops when ring is empty or budget is met or exceeded.
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index f6c9c2a670f9..448fcc325ed7 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -361,6 +361,8 @@  static void rndis_filter_receive_response(struct net_device *ndev,
 			}
 		}
 
+		netvsc_dma_unmap(((struct net_device_context *)
+			netdev_priv(ndev))->device_ctx, &request->pkt);
 		complete(&request->wait_event);
 	} else {
 		netdev_err(ndev,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 4d44fb3b3f1c..8882e46d1070 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -25,6 +25,7 @@ 
 #include <linux/interrupt.h>
 #include <linux/reciprocal_div.h>
 #include <asm/hyperv-tlfs.h>
+#include <linux/dma-map-ops.h>
 
 #define MAX_PAGE_BUFFER_COUNT				32
 #define MAX_MULTIPAGE_BUFFER_COUNT			32 /* 128K */
@@ -1583,6 +1584,11 @@  struct hyperv_service_callback {
 	void (*callback)(void *context);
 };
 
+struct hv_dma_range {		/* one DMA mapping, as recorded by netvsc_dma_map() */
+	dma_addr_t dma;		/* address returned by dma_map_single() */
+	u32 mapping_size;	/* length passed to dma_map_single(), reused for unmap */
+};
+
 #define MAX_SRV_VER	0x7ffffff
 extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
 				const int *fw_version, int fw_vercnt,