[RFC,v2,4/8] virtio/vsock: add transport zerocopy callback

Message ID 6f76eed7-decc-68d1-6ae7-7bafb09fdad3@sberdevices.ru (mailing list archive)
State RFC
Series virtio/vsock: experimental zerocopy receive

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5 this patch: 0
netdev/cc_maintainers warning 1 maintainers not CCed: edumazet@google.com
netdev/build_clang fail Errors and warnings before: 8 this patch: 13
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 7 this patch: 11
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Arseniy Krasnov June 3, 2022, 5:37 a.m. UTC
This adds a transport callback which processes the rx
queue of a socket and, instead of copying data to the
user-provided buffer, inserts the data pages of each
packet into the user's VM area.

Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
---
 include/linux/virtio_vsock.h            |   4 +
 include/uapi/linux/virtio_vsock.h       |   6 +
 net/vmw_vsock/virtio_transport_common.c | 208 +++++++++++++++++++++++-
 3 files changed, 215 insertions(+), 3 deletions(-)
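
For orientation: after a successful zerocopy dequeue, the first mapped
page holds an array of struct virtio_vsock_usr_hdr entries, and the
following pages hold the packet data those entries describe. Below is a
rough userspace sketch of walking that layout; how the mapping itself is
obtained (an mmap() on the vsock fd) is set up by other patches in this
series, so the setup here is only an assumption.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the uapi struct added by this patch. */
struct virtio_vsock_usr_hdr {
	uint32_t flags;
	uint32_t len;
	uint32_t copy_len;
} __attribute__((packed));

/* 'area' is assumed to be the region populated by the kernel,
 * 'page_size' is sysconf(_SC_PAGESIZE).
 */
static void walk_rx_area(void *area, long page_size)
{
	struct virtio_vsock_usr_hdr *hdr = area;
	unsigned long off = page_size; /* data starts at the second page */

	for (;;) {
		if (!hdr->len) {
			if (hdr->copy_len)
				/* Slab-backed packet: the kernel asks us to
				 * fall back to a plain read() of these bytes.
				 */
				printf("fallback: read() %u bytes\n",
				       hdr->copy_len);
			break; /* the last used header has len == 0 */
		}
		printf("pkt at offset %lu: flags=0x%x len=%u\n",
		       off, hdr->flags, hdr->len);
		/* 'len' payload bytes, padded out to whole pages. */
		off += ((hdr->len + page_size - 1) / page_size) * page_size;
		hdr++;
	}
}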

Comments

Stefano Garzarella June 9, 2022, 8:41 a.m. UTC | #1
On Fri, Jun 03, 2022 at 05:37:48AM +0000, Arseniy Krasnov wrote:
>This adds a transport callback which processes the rx
>queue of a socket and, instead of copying data to the
>user-provided buffer, inserts the data pages of each
>packet into the user's VM area.
>
>Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>---
> include/linux/virtio_vsock.h            |   4 +
> include/uapi/linux/virtio_vsock.h       |   6 +
> net/vmw_vsock/virtio_transport_common.c | 208 +++++++++++++++++++++++-
> 3 files changed, 215 insertions(+), 3 deletions(-)
>
>diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>index d02cb7aa922f..47a68a2ea838 100644
>--- a/include/linux/virtio_vsock.h
>+++ b/include/linux/virtio_vsock.h
>@@ -51,6 +51,7 @@ struct virtio_vsock_pkt {
> 	bool reply;
> 	bool tap_delivered;
> 	bool slab_buf;
>+	bool split;
> };
>
> struct virtio_vsock_pkt_info {
>@@ -131,6 +132,9 @@ int virtio_transport_dgram_bind(struct vsock_sock *vsk,
> 				struct sockaddr_vm *addr);
> bool virtio_transport_dgram_allow(u32 cid, u32 port);
>
>+int virtio_transport_zerocopy_dequeue(struct vsock_sock *vsk,
>+				      struct vm_area_struct *vma,
>+				      unsigned long addr);
> int virtio_transport_connect(struct vsock_sock *vsk);
>
> int virtio_transport_shutdown(struct vsock_sock *vsk, int mode);
>diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
>index 64738838bee5..6775c6c44b5b 100644
>--- a/include/uapi/linux/virtio_vsock.h
>+++ b/include/uapi/linux/virtio_vsock.h
>@@ -66,6 +66,12 @@ struct virtio_vsock_hdr {
> 	__le32	fwd_cnt;
> } __attribute__((packed));
>
>+struct virtio_vsock_usr_hdr {
>+	u32 flags;
>+	u32 len;
>+	u32 copy_len;
>+} __attribute__((packed));
>+
> enum virtio_vsock_type {
> 	VIRTIO_VSOCK_TYPE_STREAM = 1,
> 	VIRTIO_VSOCK_TYPE_SEQPACKET = 2,
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index 278567f748f2..3a3e84176c75 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -12,6 +12,7 @@
> #include <linux/ctype.h>
> #include <linux/list.h>
> #include <linux/virtio_vsock.h>
>+#include <linux/mm.h>
> #include <uapi/linux/vsockmon.h>
>
> #include <net/sock.h>
>@@ -347,6 +348,196 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
> 	return err;
> }
>
>+#define MAX_PAGES_TO_MAP 256
>+
>+int virtio_transport_zerocopy_dequeue(struct vsock_sock *vsk,
>+				      struct vm_area_struct *vma,
>+				      unsigned long addr)
>+{
>+	struct virtio_vsock_sock *vvs = vsk->trans;
>+	struct virtio_vsock_usr_hdr *usr_hdr_buffer;
>+	unsigned long max_pages_to_insert;
>+	unsigned long tmp_pages_inserted;
>+	unsigned long pages_to_insert = 0;
>+	struct page *usr_hdr_page;
>+	unsigned long vma_size;
>+	struct page **pages;
>+	int max_vma_pages;
>+	int max_usr_hdrs;
>+	int res;
>+	int err;
>+	int i;
>+
>+	/* Only allow mapping from the very start of the VMA. */
>+	if (vma->vm_start != addr)
>+		return -EFAULT;
>+
>+	vma_size = vma->vm_end - vma->vm_start;
>+
>+	/* The vma is too small: we need at least one page
>+	 * for headers and one page for data.
>+	 */
>+	if (vma_size < 2 * PAGE_SIZE)
>+		return -EFAULT;
>+
>+	/* Page for meta data. */
>+	usr_hdr_page = alloc_page(GFP_KERNEL);

I think all these checks should be done in af_vsock.c.

It would be nice to avoid having every transport reimplement the same
thing, and especially to ensure that all transports behave the same way.

If possible, it would be nice to have the transports return an array of
pages to map, and let af_vsock handle it and the usr_hdr_page.

Do you think it's doable?
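
One possible shape for that split, purely as an illustration (every name
below is made up, nothing here is from this series): the transport only
collects referenced data pages, and af_vsock owns the VMA checks, the
header page, and the actual vm_insert_pages() call.

	/* Hypothetical interface, for discussion only. */
	struct vsock_zc_batch {
		struct page **pages;	/* referenced data pages */
		int nr_pages;
		u32 flags;		/* metadata for the usr_hdr entries */
	};

	/* Transport callback: fill 'batch' with up to 'max' pages,
	 * taking a reference on each; no VMA or header-page work here.
	 */
	int (*zerocopy_dequeue)(struct vsock_sock *vsk,
				struct vsock_zc_batch *batch, int max);

	/* af_vsock.c would then, once for all transports: validate
	 * the VMA, allocate and fill the usr_hdr page from the batch
	 * metadata, call vm_insert_pages(), and drop the page refs.
	 */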

>+
>+	if (!usr_hdr_page)
>+		return -EFAULT;
>+
>+	pages = kmalloc_array(MAX_PAGES_TO_MAP, sizeof(pages[0]), GFP_KERNEL);
>+
>+	if (!pages)
>+		return -EFAULT;
>+
>+	pages[pages_to_insert++] = usr_hdr_page;
>+
>+	usr_hdr_buffer = page_to_virt(usr_hdr_page);
>+
>+	err = 0;
>+
>+	/* The first page is used for headers, so the number of
>+	 * pages available to the user is the minimum of the number
>+	 * of headers that fit in the first page and the vma size
>+	 * in pages (excluding the first page).
>+	max_usr_hdrs = PAGE_SIZE / sizeof(*usr_hdr_buffer);
>+	max_vma_pages = (vma_size / PAGE_SIZE) - 1;
>+	max_pages_to_insert = min(max_usr_hdrs, max_vma_pages);
>+
>+	if (max_pages_to_insert > MAX_PAGES_TO_MAP)
>+		max_pages_to_insert = MAX_PAGES_TO_MAP;
>+
>+	spin_lock_bh(&vvs->rx_lock);
>+
>+	while (!list_empty(&vvs->rx_queue) &&
>+	       pages_to_insert < max_pages_to_insert) {
>+		struct virtio_vsock_pkt *pkt;
>+		ssize_t rest_data_bytes;
>+		size_t moved_data_bytes;
>+		unsigned long pg_offs;
>+
>+		pkt = list_first_entry(&vvs->rx_queue,
>+				       struct virtio_vsock_pkt, list);
>+
>+		/* Buffer was allocated by 'kmalloc()'. This can
>+		 * happen when zerocopy was enabled while a pending
>+		 * packet created before that was still queued.
>+		 */
>+		if (pkt->slab_buf) {
>+			usr_hdr_buffer->flags = le32_to_cpu(pkt->hdr.flags);
>+			usr_hdr_buffer->len = 0;
>+			usr_hdr_buffer->copy_len = le32_to_cpu(pkt->hdr.len);
>+			/* Report user to read it using copy. */

Is it a "to do"?

>+			break;
>+		}
>+
>+		/* This can happen when the packet was partially
>+		 * dequeued before by an ordinary 'read()' call. We
>+		 * can't handle such a packet. Drop it.

We can't drop packets.
I think we should allow enabling/disabling this new feature only before
the connection is established.

>+		 */
>+		if (pkt->off % PAGE_SIZE) {
>+			list_del(&pkt->list);
>+			virtio_transport_dec_rx_pkt(vvs, pkt);
>+			virtio_transport_free_pkt(pkt);
>+			continue;
>+		}
>+
>+		rest_data_bytes = le32_to_cpu(pkt->hdr.len) - pkt->off;
>+
>+		/* For packets bigger than one page, split the
>+		 * high-order allocated buffer into 0-order pages.
>+		 * Otherwise 'vm_insert_pages()' will fail for
>+		 * all pages except the first.
>+		 */
>+		if (rest_data_bytes > PAGE_SIZE) {
>+			/* High order buffer not split yet. */
>+			if (!pkt->split) {
>+				split_page(virt_to_page(pkt->buf),
>+					   get_order(le32_to_cpu(pkt->hdr.len)));
>+				pkt->split = true;
>+			}
>+		}
>+
>+		pg_offs = pkt->off;
>+		moved_data_bytes = 0;
>+
>+		while (rest_data_bytes &&
>+		       pages_to_insert < max_pages_to_insert) {
>+			struct page *buf_page;
>+
>+			buf_page = virt_to_page(pkt->buf + pg_offs);
>+
>+			pages[pages_to_insert++] = buf_page;
>+			/* Get a reference to prevent this page from
>+			 * being returned to the page allocator when
>+			 * the packet is freed. Ref count will be 2.
>+			 */
>+			get_page(buf_page);
>+			pg_offs += PAGE_SIZE;
>+
>+			if (rest_data_bytes >= PAGE_SIZE) {
>+				moved_data_bytes += PAGE_SIZE;
>+				rest_data_bytes -= PAGE_SIZE;
>+			} else {
>+				moved_data_bytes += rest_data_bytes;
>+				rest_data_bytes = 0;
>+			}
>+		}
>+
>+		usr_hdr_buffer->flags = le32_to_cpu(pkt->hdr.flags);
>+		usr_hdr_buffer->len = moved_data_bytes;
>+		usr_hdr_buffer->copy_len = 0;
>+		usr_hdr_buffer++;
>+
>+		pkt->off = pg_offs;
>+
>+		if (rest_data_bytes == 0) {
>+			list_del(&pkt->list);
>+			virtio_transport_dec_rx_pkt(vvs, pkt);
>+			virtio_transport_free_pkt(pkt);
>+		}
>+
>+		/* Now ref count for all pages of packet is 1. */
>+	}
>+
>+	/* Mark the last buffer empty (if we have one). */
>+	if (pages_to_insert - 1 < max_usr_hdrs)
>+		usr_hdr_buffer->len = 0;
>+
>+	spin_unlock_bh(&vvs->rx_lock);
>+
>+	tmp_pages_inserted = pages_to_insert;
>+
>+	res = vm_insert_pages(vma, addr, pages, &tmp_pages_inserted);
>+
>+	if (res || tmp_pages_inserted) {
>+		/* Failed to insert some pages, so we have a
>+		 * "partially" mapped vma. Don't return yet, just set
>+		 * the error code to be returned to the user. The user
>+		 * needs to call 'madvise()/mmap()' to clear this vma.
>+		 * Anyway, references to all pages will be dropped below.
>+		 */
>+		err = -EFAULT;
>+	}
>+
>+	/* Put reference for every page. */
>+	for (i = 0; i < pages_to_insert; i++) {
>+		/* Ref count is 2 ('get_page()' + 'vm_insert_pages()' above).
>+		 * Put the reference once; the page will be returned to the
>+		 * allocator after the user's 'madvise()/munmap()' call (or
>+		 * it was never mapped, if 'vm_insert_pages()' failed).
>+		 */
>+		put_page(pages[i]);
>+	}
>+
>+	virtio_transport_send_credit_update(vsk);
>+	kfree(pages);
>+
>+	return err;
>+}
>+EXPORT_SYMBOL_GPL(virtio_transport_zerocopy_dequeue);
>+
> static ssize_t
> virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
> 				   struct msghdr *msg,
>@@ -1344,10 +1535,21 @@ EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
> void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
> {
> 	if (pkt->buf_len) {
>-		if (pkt->slab_buf)
>+		if (pkt->slab_buf) {
> 			kfree(pkt->buf);
>-		else
>-			free_pages(buf, get_order(pkt->buf_len));
>+		} else {
>+			unsigned int order = get_order(pkt->buf_len);
>+			unsigned long buf = (unsigned long)pkt->buf;
>+
>+			if (pkt->split) {
>+				int i;
>+
>+				for (i = 0; i < (1 << order); i++)
>+					free_page(buf + i * PAGE_SIZE);
>+			} else {
>+				free_pages(buf, order);
>+			}
>+		}
> 	}
>
> 	kfree(pkt);
>-- 
>2.25.1
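
As an aside, the page reference choreography that this patch relies on,
condensed into one sequence (an illustrative walk-through, not code from
the series; refcounts are per 0-order data page):

	/* alloc_pages(order)               refcount == 1 (packet's ref)
	 * split_page(buf_page, order)      still 1 per 0-order page
	 * get_page(page)                   2: packet + dequeue path
	 * virtio_transport_free_pkt(pkt)   packet's ref dropped -> 1
	 * vm_insert_pages(vma, ...)        2: dequeue path + user mapping
	 * put_page(page)                   1: held by the user mapping only
	 * munmap()/MADV_DONTNEED           0: page returns to the allocator
	 */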
Arseniy Krasnov June 9, 2022, 12:23 p.m. UTC | #2
On 09.06.2022 11:41, Stefano Garzarella wrote:
> On Fri, Jun 03, 2022 at 05:37:48AM +0000, Arseniy Krasnov wrote:
>> This adds a transport callback which processes the rx
>> queue of a socket and, instead of copying data to the
>> user-provided buffer, inserts the data pages of each
>> packet into the user's VM area.
[...]
>> +    /* Page for meta data. */
>> +    usr_hdr_page = alloc_page(GFP_KERNEL);
> 
> I think all these checks should be done in af_vsock.c.
> 
> It would be nice to avoid having every transport reimplement the same thing, and especially to ensure that all transports behave the same way.
> 
> If possible, it would be nice to have the transports return an array of pages to map, and let af_vsock handle it and the usr_hdr_page.
> 
> Do you think it's doable?
Yes, as we discussed in the patch above, this part could be made common.
> 
[...]
>> +            /* Report user to read it using copy. */
> 
> Is it a "to do"?
No, it seems I need to move my comment under the opening bracket.
> 
>> +            break;
>> +        }
>> +
>> +        /* This can happen when the packet was partially
>> +         * dequeued before by an ordinary 'read()' call. We
>> +         * can't handle such a packet. Drop it.
> 
> We can't drop packets.
> I think we should allow enabling/disabling this new feature only before the connection is established.
Yes, I'll allow enabling/disabling it only in the non-connected state.
> 
[...]
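
A minimal sketch of that gate, assuming a made-up option name (the real
knob, if any, is defined elsewhere in this series), inside the vsock
setsockopt() switch:

	/* Hypothetical: refuse to toggle zerocopy once connected. */
	case SO_VM_SOCKETS_ZEROCOPY:	/* placeholder name */
		if (sk->sk_state != TCP_CLOSE) {
			err = -EBUSY;	/* only before connect()/listen() */
			break;
		}
		vsk->rx_zerocopy_on = !!val;	/* hypothetical field */
		break;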
Patch

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index d02cb7aa922f..47a68a2ea838 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -51,6 +51,7 @@  struct virtio_vsock_pkt {
 	bool reply;
 	bool tap_delivered;
 	bool slab_buf;
+	bool split;
 };
 
 struct virtio_vsock_pkt_info {
@@ -131,6 +132,9 @@  int virtio_transport_dgram_bind(struct vsock_sock *vsk,
 				struct sockaddr_vm *addr);
 bool virtio_transport_dgram_allow(u32 cid, u32 port);
 
+int virtio_transport_zerocopy_dequeue(struct vsock_sock *vsk,
+				      struct vm_area_struct *vma,
+				      unsigned long addr);
 int virtio_transport_connect(struct vsock_sock *vsk);
 
 int virtio_transport_shutdown(struct vsock_sock *vsk, int mode);
diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
index 64738838bee5..6775c6c44b5b 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -66,6 +66,12 @@  struct virtio_vsock_hdr {
 	__le32	fwd_cnt;
 } __attribute__((packed));
 
+struct virtio_vsock_usr_hdr {
+	u32 flags;
+	u32 len;
+	u32 copy_len;
+} __attribute__((packed));
+
 enum virtio_vsock_type {
 	VIRTIO_VSOCK_TYPE_STREAM = 1,
 	VIRTIO_VSOCK_TYPE_SEQPACKET = 2,
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 278567f748f2..3a3e84176c75 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -12,6 +12,7 @@ 
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/virtio_vsock.h>
+#include <linux/mm.h>
 #include <uapi/linux/vsockmon.h>
 
 #include <net/sock.h>
@@ -347,6 +348,196 @@  virtio_transport_stream_do_peek(struct vsock_sock *vsk,
 	return err;
 }
 
+#define MAX_PAGES_TO_MAP 256
+
+int virtio_transport_zerocopy_dequeue(struct vsock_sock *vsk,
+				      struct vm_area_struct *vma,
+				      unsigned long addr)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct virtio_vsock_usr_hdr *usr_hdr_buffer;
+	unsigned long max_pages_to_insert;
+	unsigned long tmp_pages_inserted;
+	unsigned long pages_to_insert = 0;
+	struct page *usr_hdr_page;
+	unsigned long vma_size;
+	struct page **pages;
+	int max_vma_pages;
+	int max_usr_hdrs;
+	int res;
+	int err;
+	int i;
+
+	/* Only allow mapping from the very start of the VMA. */
+	if (vma->vm_start != addr)
+		return -EFAULT;
+
+	vma_size = vma->vm_end - vma->vm_start;
+
+	/* The vma is too small: we need at least one page
+	 * for headers and one page for data.
+	 */
+	if (vma_size < 2 * PAGE_SIZE)
+		return -EFAULT;
+
+	/* Page for meta data. */
+	usr_hdr_page = alloc_page(GFP_KERNEL);
+
+	if (!usr_hdr_page)
+		return -EFAULT;
+
+	pages = kmalloc_array(MAX_PAGES_TO_MAP, sizeof(pages[0]), GFP_KERNEL);
+
+	if (!pages)
+		return -EFAULT;
+
+	pages[pages_to_insert++] = usr_hdr_page;
+
+	usr_hdr_buffer = page_to_virt(usr_hdr_page);
+
+	err = 0;
+
+	/* The first page is used for headers, so the number of
+	 * pages available to the user is the minimum of the number
+	 * of headers that fit in the first page and the vma size
+	 * in pages (excluding the first page).
+	max_usr_hdrs = PAGE_SIZE / sizeof(*usr_hdr_buffer);
+	max_vma_pages = (vma_size / PAGE_SIZE) - 1;
+	max_pages_to_insert = min(max_usr_hdrs, max_vma_pages);
+
+	if (max_pages_to_insert > MAX_PAGES_TO_MAP)
+		max_pages_to_insert = MAX_PAGES_TO_MAP;
+
+	spin_lock_bh(&vvs->rx_lock);
+
+	while (!list_empty(&vvs->rx_queue) &&
+	       pages_to_insert < max_pages_to_insert) {
+		struct virtio_vsock_pkt *pkt;
+		ssize_t rest_data_bytes;
+		size_t moved_data_bytes;
+		unsigned long pg_offs;
+
+		pkt = list_first_entry(&vvs->rx_queue,
+				       struct virtio_vsock_pkt, list);
+
+		/* Buffer was allocated by 'kmalloc()'. This can
+		 * happen when zerocopy was enabled while a pending
+		 * packet created before that was still queued.
+		 */
+		if (pkt->slab_buf) {
+			usr_hdr_buffer->flags = le32_to_cpu(pkt->hdr.flags);
+			usr_hdr_buffer->len = 0;
+			usr_hdr_buffer->copy_len = le32_to_cpu(pkt->hdr.len);
+			/* Report user to read it using copy. */
+			break;
+		}
+
+		/* This can happen when the packet was partially
+		 * dequeued before by an ordinary 'read()' call. We
+		 * can't handle such a packet. Drop it.
+		 */
+		if (pkt->off % PAGE_SIZE) {
+			list_del(&pkt->list);
+			virtio_transport_dec_rx_pkt(vvs, pkt);
+			virtio_transport_free_pkt(pkt);
+			continue;
+		}
+
+		rest_data_bytes = le32_to_cpu(pkt->hdr.len) - pkt->off;
+
+		/* For packets bigger than one page, split the
+		 * high-order allocated buffer into 0-order pages.
+		 * Otherwise 'vm_insert_pages()' will fail for
+		 * all pages except the first.
+		 */
+		if (rest_data_bytes > PAGE_SIZE) {
+			/* High order buffer not split yet. */
+			if (!pkt->split) {
+				split_page(virt_to_page(pkt->buf),
+					   get_order(le32_to_cpu(pkt->hdr.len)));
+				pkt->split = true;
+			}
+		}
+
+		pg_offs = pkt->off;
+		moved_data_bytes = 0;
+
+		while (rest_data_bytes &&
+		       pages_to_insert < max_pages_to_insert) {
+			struct page *buf_page;
+
+			buf_page = virt_to_page(pkt->buf + pg_offs);
+
+			pages[pages_to_insert++] = buf_page;
+		/* Get a reference to prevent this page from
+		 * being returned to the page allocator when
+		 * the packet is freed. Ref count will be 2.
+		 */
+			get_page(buf_page);
+			pg_offs += PAGE_SIZE;
+
+			if (rest_data_bytes >= PAGE_SIZE) {
+				moved_data_bytes += PAGE_SIZE;
+				rest_data_bytes -= PAGE_SIZE;
+			} else {
+				moved_data_bytes += rest_data_bytes;
+				rest_data_bytes = 0;
+			}
+		}
+
+		usr_hdr_buffer->flags = le32_to_cpu(pkt->hdr.flags);
+		usr_hdr_buffer->len = moved_data_bytes;
+		usr_hdr_buffer->copy_len = 0;
+		usr_hdr_buffer++;
+
+		pkt->off = pg_offs;
+
+		if (rest_data_bytes == 0) {
+			list_del(&pkt->list);
+			virtio_transport_dec_rx_pkt(vvs, pkt);
+			virtio_transport_free_pkt(pkt);
+		}
+
+		/* Now ref count for all pages of packet is 1. */
+	}
+
+	/* Mark the last buffer empty (if we have one). */
+	if (pages_to_insert - 1 < max_usr_hdrs)
+		usr_hdr_buffer->len = 0;
+
+	spin_unlock_bh(&vvs->rx_lock);
+
+	tmp_pages_inserted = pages_to_insert;
+
+	res = vm_insert_pages(vma, addr, pages, &tmp_pages_inserted);
+
+	if (res || tmp_pages_inserted) {
+		/* Failed to insert some pages, so we have a
+		 * "partially" mapped vma. Don't return yet, just set
+		 * the error code to be returned to the user. The user
+		 * needs to call 'madvise()/mmap()' to clear this vma.
+		 * Anyway, references to all pages will be dropped below.
+		 */
+		err = -EFAULT;
+	}
+
+	/* Put reference for every page. */
+	for (i = 0; i < pages_to_insert; i++) {
+		/* Ref count is 2 ('get_page()' + 'vm_insert_pages()' above).
+		 * Put the reference once; the page will be returned to the
+		 * allocator after the user's 'madvise()/munmap()' call (or
+		 * it was never mapped, if 'vm_insert_pages()' failed).
+		 */
+		put_page(pages[i]);
+	}
+
+	virtio_transport_send_credit_update(vsk);
+	kfree(pages);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_zerocopy_dequeue);
+
 static ssize_t
 virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 				   struct msghdr *msg,
@@ -1344,10 +1535,21 @@  EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
 void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
 {
 	if (pkt->buf_len) {
-		if (pkt->slab_buf)
+		if (pkt->slab_buf) {
 			kfree(pkt->buf);
-		else
-			free_pages(buf, get_order(pkt->buf_len));
+		} else {
+			unsigned int order = get_order(pkt->buf_len);
+			unsigned long buf = (unsigned long)pkt->buf;
+
+			if (pkt->split) {
+				int i;
+
+				for (i = 0; i < (1 << order); i++)
+					free_page(buf + i * PAGE_SIZE);
+			} else {
+				free_pages(buf, order);
+			}
+		}
 	}
 
 	kfree(pkt);
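
To make the sizing rule in virtio_transport_zerocopy_dequeue() concrete,
here is a small standalone calculation mirroring its arithmetic (plain C;
the 4 KiB page size and the 64-page vma are example assumptions):

	#include <stdio.h>

	#define PAGE_SIZE         4096UL
	#define USR_HDR_SIZE        12UL /* sizeof(struct virtio_vsock_usr_hdr) */
	#define MAX_PAGES_TO_MAP   256UL

	int main(void)
	{
		unsigned long vma_size = 64 * PAGE_SIZE;                 /* example vma */
		unsigned long max_usr_hdrs = PAGE_SIZE / USR_HDR_SIZE;   /* 341 */
		unsigned long max_vma_pages = vma_size / PAGE_SIZE - 1;  /* 63 */
		unsigned long n = max_usr_hdrs < max_vma_pages ?
				  max_usr_hdrs : max_vma_pages;

		if (n > MAX_PAGES_TO_MAP)
			n = MAX_PAGES_TO_MAP;

		/* The first slot in the pages[] array is taken by the
		 * header page, so at most n - 1 data pages are mapped
		 * per call.
		 */
		printf("slots: %lu, data pages per call: %lu\n", n, n - 1);
		return 0;
	}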