diff mbox series

[v3,1/5] net: add header len parameter to tun_get_socket(), tap_get_socket()

Message ID 20210624123005.1301761-1-dwmw2@infradead.org (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [v3,1/5] net: add header len parameter to tun_get_socket(), tap_get_socket() | expand

Checks

Context Check Description
netdev/cover_letter warning Series does not have a cover letter
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix success Link
netdev/cc_maintainers fail 2 blamed authors not CCed: mst@redhat.com dlstevens@us.ibm.com; 6 maintainers not CCed: kvm@vger.kernel.org mst@redhat.com dlstevens@us.ibm.com virtualization@lists.linux-foundation.org davem@davemloft.net kuba@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 186 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 1
netdev/header_inline success Link

Commit Message

David Woodhouse June 24, 2021, 12:30 p.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

The vhost-net driver was making wild assumptions about the header length
of the underlying tun/tap socket. Then it was discarding packets if
the number of bytes it got from sock_recvmsg() didn't precisely match
its guess.

Fix it to get the correct information along with the socket itself.
As a side-effect, this means that tun_get_socket() won't work if the
tun file isn't actually connected to a device, since there's no 'tun'
yet in that case to get the information from.

On the receive side, where the tun device generates the virtio_net_hdr
but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill
in the 'num_buffers' field on top of the existing virtio_net_hdr, fix
that to use 'sock_hlen - 2' as the location, which means that it goes
in the right place regardless of whether the tun device is using an
additional tun_pi header or not. In this case, the user should have
configured the tun device with a vnet hdr size of 12, to make room.

Fixes: 8dd014adfea6f ("vhost-net: mergeable buffers support")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 drivers/net/tap.c      |  5 ++++-
 drivers/net/tun.c      | 16 +++++++++++++++-
 drivers/vhost/net.c    | 31 +++++++++++++++----------------
 include/linux/if_tap.h |  4 ++--
 include/linux/if_tun.h |  4 ++--
 5 files changed, 38 insertions(+), 22 deletions(-)

Comments

Jason Wang June 25, 2021, 5 a.m. UTC | #1
在 2021/6/24 下午8:30, David Woodhouse 写道:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> The vhost-net driver was making wild assumptions about the header length
> of the underlying tun/tap socket.


It's by design to depend on the userspace to co-ordinate the vnet header 
setting with the underlying sockets.


>   Then it was discarding packets if
> the number of bytes it got from sock_recvmsg() didn't precisely match
> its guess.


Anything that is broken by this? The failure is a hint for the userspace 
that something is wrong during the coordination.


>
> Fix it to get the correct information along with the socket itself.


I'm not sure what is fixed by this. It looks to me it tires to let 
packet go even if the userspace set the wrong attributes to tap or 
vhost. This is even sub-optimal than failing explicitly fail the RX.


> As a side-effect, this means that tun_get_socket() won't work if the
> tun file isn't actually connected to a device, since there's no 'tun'
> yet in that case to get the information from.


This may break the existing application. Vhost-net is tied to the socket 
instead of the device that the socket is loosely coupled.


>
> On the receive side, where the tun device generates the virtio_net_hdr
> but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill
> in the 'num_buffers' field on top of the existing virtio_net_hdr, fix
> that to use 'sock_hlen - 2' as the location, which means that it goes
> in the right place regardless of whether the tun device is using an
> additional tun_pi header or not. In this case, the user should have
> configured the tun device with a vnet hdr size of 12, to make room.
>
> Fixes: 8dd014adfea6f ("vhost-net: mergeable buffers support")
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
>   drivers/net/tap.c      |  5 ++++-
>   drivers/net/tun.c      | 16 +++++++++++++++-
>   drivers/vhost/net.c    | 31 +++++++++++++++----------------
>   include/linux/if_tap.h |  4 ++--
>   include/linux/if_tun.h |  4 ++--
>   5 files changed, 38 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 8e3a28ba6b28..2170a0d3d34c 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -1246,7 +1246,7 @@ static const struct proto_ops tap_socket_ops = {
>    * attached to a device.  The returned object works like a packet socket, it
>    * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
>    * holding a reference to the file for as long as the socket is in use. */
> -struct socket *tap_get_socket(struct file *file)
> +struct socket *tap_get_socket(struct file *file, size_t *hlen)
>   {
>   	struct tap_queue *q;
>   	if (file->f_op != &tap_fops)
> @@ -1254,6 +1254,9 @@ struct socket *tap_get_socket(struct file *file)
>   	q = file->private_data;
>   	if (!q)
>   		return ERR_PTR(-EBADFD);
> +	if (hlen)
> +		*hlen = (q->flags & IFF_VNET_HDR) ? q->vnet_hdr_sz : 0;
> +
>   	return &q->sock;
>   }
>   EXPORT_SYMBOL_GPL(tap_get_socket);
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 4cf38be26dc9..67b406fa0881 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -3649,7 +3649,7 @@ static void tun_cleanup(void)
>    * attached to a device.  The returned object works like a packet socket, it
>    * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
>    * holding a reference to the file for as long as the socket is in use. */
> -struct socket *tun_get_socket(struct file *file)
> +struct socket *tun_get_socket(struct file *file, size_t *hlen)
>   {
>   	struct tun_file *tfile;
>   	if (file->f_op != &tun_fops)
> @@ -3657,6 +3657,20 @@ struct socket *tun_get_socket(struct file *file)
>   	tfile = file->private_data;
>   	if (!tfile)
>   		return ERR_PTR(-EBADFD);
> +
> +	if (hlen) {
> +		struct tun_struct *tun = tun_get(tfile);
> +		size_t len = 0;
> +
> +		if (!tun)
> +			return ERR_PTR(-ENOTCONN);
> +		if (tun->flags & IFF_VNET_HDR)
> +			len += READ_ONCE(tun->vnet_hdr_sz);
> +		if (!(tun->flags & IFF_NO_PI))
> +			len += sizeof(struct tun_pi);
> +		tun_put(tun);
> +		*hlen = len;
> +	}
>   	return &tfile->socket;
>   }
>   EXPORT_SYMBOL_GPL(tun_get_socket);
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index df82b124170e..b92a7144ed90 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net)
>   
>   	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
>   		vq->log : NULL;
> -	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
> +	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) &&
> +		(vhost_hlen || sock_hlen >= sizeof(num_buffers));


So this change the behavior. When mergeable buffer is enabled, userspace 
expects the vhost to merge buffers. If the feature is disabled silently, 
it violates virtio spec.

If anything wrong in the setup, userspace just breaks itself.

E.g if sock_hlen is less that struct virtio_net_hdr_mrg_buf. The packet 
header might be overwrote by the vnet header.


>   
>   	do {
>   		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
> @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net)
>   			}
>   		} else {
>   			/* Header came from socket; we'll need to patch
> -			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
> +			 * ->num_buffers over the last two bytes if
> +			 * VIRTIO_NET_F_MRG_RXBUF is enabled.
>   			 */
> -			iov_iter_advance(&fixup, sizeof(hdr));
> +			iov_iter_advance(&fixup, sock_hlen - 2);


I'm not sure what did the above code want to fix. It doesn't change 
anything if vnet header is set correctly in TUN. It only prevents the 
the packet header from being rewrote.

Thanks
David Woodhouse June 25, 2021, 8:23 a.m. UTC | #2
On Fri, 2021-06-25 at 13:00 +0800, Jason Wang wrote:
> 在 2021/6/24 下午8:30, David Woodhouse 写道:
> > From: David Woodhouse <dwmw@amazon.co.uk>
> > 
> > The vhost-net driver was making wild assumptions about the header length
> > of the underlying tun/tap socket.
> 
> 
> It's by design to depend on the userspace to co-ordinate the vnet header 
> setting with the underlying sockets.
> 
> 
> >   Then it was discarding packets if
> > the number of bytes it got from sock_recvmsg() didn't precisely match
> > its guess.
> 
> 
> Anything that is broken by this? The failure is a hint for the userspace 
> that something is wrong during the coordination.

I am not a fan of this approach. I firmly believe that for a given
configuration, the kernel should either *work* or it should gracefully
refuse to set it up that way. And the requirements should be clearly
documented.

Having been on the receiving end of this "hint" of which you speak, I
found it distinctly suboptimal as a user interface. I was left
scrabbling around trying to find a set of options which *would* work,
and it was only through debugging the kernel that I managed to work out
that I:

  • MUST set IFF_NO_PI
  • MUST use TUNSETSNDBUF to reduce the sndbuf from INT_MAX
  • MUST use a virtio_net_hdr that I don't want

If my application failed to do any of those things, I got a silent
failure to transport any packets. The only thing I could do *without*
debugging the kernel was tcpdump on the 'tun0' interface and see if the
TX packets I put into the ring were even making it to the interface,
and what they looked like if they did. (Losing the first 14 bytes and
having the *next* 14 bytes scribbled on by an Ethernet header was a fun
one.)





> > 
> > Fix it to get the correct information along with the socket itself.
> 
> 
> I'm not sure what is fixed by this. It looks to me it tires to let 
> packet go even if the userspace set the wrong attributes to tap or 
> vhost. This is even sub-optimal than failing explicitly fail the RX.

I'm OK with explicit failure. But once I'd let it *get* the information
from the underlying socket in order to decide whether it should fail or
not, it turned out to be easy enough just to make those configs work
anyway.

The main case where that "easy enough" is stretched a little (IMO) was
when there's a tun_pi header. I have one more of your emails to reply
to after this, and I'll address that there.


> 
> > As a side-effect, this means that tun_get_socket() won't work if the
> > tun file isn't actually connected to a device, since there's no 'tun'
> > yet in that case to get the information from.
> 
> 
> This may break the existing application. Vhost-net is tied to the socket 
> instead of the device that the socket is loosely coupled.

Hm. Perhaps the PI and vnet hdr should be considered an option of the
*socket* (which is tied to the tfile), not purely an option of the
underlying device?

Or maybe it's sufficient just to get the flags from *either* tfile->tun 
or tfile->detached, so that it works when the queue is detached. I'll
take a look.

I suppose we could even have a fallback that makes stuff up like we do
today. If the user attempts to attach a tun file descriptor to vhost
without ever calling TUNSETIFF on it first, *then* we make the same
assumptions we do today?

> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net)
> >   
> >   	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
> >   		vq->log : NULL;
> > -	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
> > +	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) &&
> > +		(vhost_hlen || sock_hlen >= sizeof(num_buffers));
> 
> 
> So this change the behavior. When mergeable buffer is enabled, userspace 
> expects the vhost to merge buffers. If the feature is disabled silently, 
> it violates virtio spec.
> 
> If anything wrong in the setup, userspace just breaks itself.
> 
> E.g if sock_hlen is less that struct virtio_net_hdr_mrg_buf. The packet 
> header might be overwrote by the vnet header.

This wasn't intended to change the behaviour of any code path that is
already working today. If *either* vhost or the underlying device have
provided a vnet header, we still merge.

If *neither* provide a vnet hdr, there's nowhere to put num_buffers and
we can't merge.

That code path doesn't work at all today, but does after my patches.
But you're right, we should explicitly refuse to negotiate
VIRITO_NET_F_MSG_RXBUF in that case.

> 
> >   
> >   	do {
> >   		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
> > @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net)
> >   			}
> >   		} else {
> >   			/* Header came from socket; we'll need to patch
> > -			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
> > +			 * ->num_buffers over the last two bytes if
> > +			 * VIRTIO_NET_F_MRG_RXBUF is enabled.
> >   			 */
> > -			iov_iter_advance(&fixup, sizeof(hdr));
> > +			iov_iter_advance(&fixup, sock_hlen - 2);
> 
> 
> I'm not sure what did the above code want to fix. It doesn't change 
> anything if vnet header is set correctly in TUN. It only prevents the 
> the packet header from being rewrote.
> 

It fixes the case where the virtio_net_hdr isn't at the start of the
tun header, because the tun actually puts the tun_pi struct *first*,
and *then* the virtio_net_hdr. 

The num_buffers field needs to go at the *end* of sock_hlen. Not at a
fixed offset from the *start* of it.

At least, that's true unless we want to just declare that we *only*
support TUN with the IFF_NO_PI flag. (qv).
Willem de Bruijn June 25, 2021, 6:13 p.m. UTC | #3
On Thu, Jun 24, 2021 at 8:30 AM David Woodhouse <dwmw2@infradead.org> wrote:
>
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> The vhost-net driver was making wild assumptions about the header length

If respinning, please more concretely describe which configuration is
currently broken.

IFF_NO_PI + IFF_VNET_HDR, if I understand correctly. But I got that
from the discussion, not the commit message.

> of the underlying tun/tap socket. Then it was discarding packets if
> the number of bytes it got from sock_recvmsg() didn't precisely match
> its guess.
>
> Fix it to get the correct information along with the socket itself.
> As a side-effect, this means that tun_get_socket() won't work if the
> tun file isn't actually connected to a device, since there's no 'tun'
> yet in that case to get the information from.
>
> On the receive side, where the tun device generates the virtio_net_hdr
> but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill

Nit: VIRTIO_NET_F_MSG_RXBUF

> in the 'num_buffers' field on top of the existing virtio_net_hdr, fix
> that to use 'sock_hlen - 2' as the location, which means that it goes

Please use sizeof(hdr.num_buffers) instead of a raw constant 2, to
self document the code.

Should this be an independent one-line fix?
David Woodhouse June 25, 2021, 6:55 p.m. UTC | #4
On Fri, 2021-06-25 at 14:13 -0400, Willem de Bruijn wrote:
> On Thu, Jun 24, 2021 at 8:30 AM David Woodhouse <dwmw2@infradead.org>
> wrote:
> > 
> > From: David Woodhouse <dwmw@amazon.co.uk>
> > 
> > The vhost-net driver was making wild assumptions about the header
> > length
> 
> If respinning, please more concretely describe which configuration is
> currently broken.

Fairly much all of them. Here's a test run on the 5.12.8 kernel:

$ sudo ./test_vhost_net 
TEST: (hdr 0, xdp 0, pi 0, features 0) RESULT: -1
TEST: (hdr 10, xdp 0, pi 0, features 0) RESULT: 0
TEST: (hdr 12, xdp 0, pi 0, features 0) RESULT: -1
TEST: (hdr 20, xdp 0, pi 0, features 0) RESULT: -1
TEST: (hdr 0, xdp 1, pi 0, features 0) RESULT: -1
TEST: (hdr 10, xdp 1, pi 0, features 0) RESULT: -1
TEST: (hdr 12, xdp 1, pi 0, features 0) RESULT: -1
TEST: (hdr 20, xdp 1, pi 0, features 0) RESULT: -1
TEST: (hdr 0, xdp 0, pi 1, features 0) RESULT: -1
TEST: (hdr 10, xdp 0, pi 1, features 0) RESULT: -1
TEST: (hdr 12, xdp 0, pi 1, features 0) RESULT: -1
TEST: (hdr 20, xdp 0, pi 1, features 0) RESULT: -1
TEST: (hdr 0, xdp 1, pi 1, features 0) RESULT: -1
TEST: (hdr 10, xdp 1, pi 1, features 0) RESULT: -1
TEST: (hdr 12, xdp 1, pi 1, features 0) RESULT: -1
TEST: (hdr 20, xdp 1, pi 1, features 0) RESULT: -1
TEST: (hdr 0, xdp 0, pi 0, features 100000000) RESULT: -1
TEST: (hdr 10, xdp 0, pi 0, features 100000000) RESULT: -1
TEST: (hdr 12, xdp 0, pi 0, features 100000000) RESULT: 0
TEST: (hdr 20, xdp 0, pi 0, features 100000000) RESULT: -1
TEST: (hdr 0, xdp 1, pi 0, features 100000000) RESULT: -1
TEST: (hdr 10, xdp 1, pi 0, features 100000000) RESULT: -1
TEST: (hdr 12, xdp 1, pi 0, features 100000000) RESULT: -1
TEST: (hdr 20, xdp 1, pi 0, features 100000000) RESULT: -1
TEST: (hdr 0, xdp 0, pi 1, features 100000000) RESULT: -1
TEST: (hdr 10, xdp 0, pi 1, features 100000000) RESULT: -1
TEST: (hdr 12, xdp 0, pi 1, features 100000000) RESULT: -1
TEST: (hdr 20, xdp 0, pi 1, features 100000000) RESULT: -1
TEST: (hdr 0, xdp 1, pi 1, features 100000000) RESULT: -1
TEST: (hdr 10, xdp 1, pi 1, features 100000000) RESULT: -1
TEST: (hdr 12, xdp 1, pi 1, features 100000000) RESULT: -1
TEST: (hdr 20, xdp 1, pi 1, features 100000000) RESULT: -1
TEST: (hdr 0, xdp 0, pi 0, features 8000000) RESULT: 0
TEST: (hdr 0, xdp 1, pi 0, features 8000000) RESULT: -1
TEST: (hdr 0, xdp 0, pi 1, features 8000000) RESULT: -1
TEST: (hdr 0, xdp 1, pi 1, features 8000000) RESULT: -1
TEST: (hdr 0, xdp 0, pi 0, features 108000000) RESULT: 0
TEST: (hdr 0, xdp 1, pi 0, features 108000000) RESULT: -1
TEST: (hdr 0, xdp 0, pi 1, features 108000000) RESULT: -1
TEST: (hdr 0, xdp 1, pi 1, features 108000000) RESULT: -1

> IFF_NO_PI + IFF_VNET_HDR, if I understand correctly. 

That's fairly much the only one that *did* work. As long as you use
TUNSETSNDBUF which has the undocumented side-effect of turning off the
XDP path.

> > On the receive side, where the tun device generates the virtio_net_hdr
> > but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill
> 
> Nit: VIRTIO_NET_F_MSG_RXBUF

Thanks.

> > in the 'num_buffers' field on top of the existing virtio_net_hdr, fix
> > that to use 'sock_hlen - 2' as the location, which means that it goes
> 
> Please use sizeof(hdr.num_buffers) instead of a raw constant 2, to
> self document the code.

Makes sense.

> Should this be an independent one-line fix?

I don't think so; it's very much intertwined with the way it makes
assumptions about someone else's data.
Jason Wang June 28, 2021, 4:22 a.m. UTC | #5
在 2021/6/25 下午4:23, David Woodhouse 写道:
> On Fri, 2021-06-25 at 13:00 +0800, Jason Wang wrote:
>> 在 2021/6/24 下午8:30, David Woodhouse 写道:
>>> From: David Woodhouse <dwmw@amazon.co.uk>
>>>
>>> The vhost-net driver was making wild assumptions about the header length
>>> of the underlying tun/tap socket.
>>
>> It's by design to depend on the userspace to co-ordinate the vnet header
>> setting with the underlying sockets.
>>
>>
>>>    Then it was discarding packets if
>>> the number of bytes it got from sock_recvmsg() didn't precisely match
>>> its guess.
>>
>> Anything that is broken by this? The failure is a hint for the userspace
>> that something is wrong during the coordination.
> I am not a fan of this approach. I firmly believe that for a given
> configuration, the kernel should either *work* or it should gracefully
> refuse to set it up that way. And the requirements should be clearly
> documented.


That works only if all the logic were implemented in the same module but 
not the case in the e.g networking stack that a packet need to iterate 
several modules.

E.g in this case, the vnet header size of the TAP could be changed at 
anytime via TUNSETVNETHDRSZ, and tuntap is unaware of the existence of 
vhost_net. This makes it impossible to do refuse in the case of setup 
(SET_BACKEND).


>
> Having been on the receiving end of this "hint" of which you speak, I
> found it distinctly suboptimal as a user interface. I was left
> scrabbling around trying to find a set of options which *would* work,
> and it was only through debugging the kernel that I managed to work out
> that I:
>
>    • MUST set IFF_NO_PI
>    • MUST use TUNSETSNDBUF to reduce the sndbuf from INT_MAX
>    • MUST use a virtio_net_hdr that I don't want
>
> If my application failed to do any of those things, I got a silent
> failure to transport any packets.


Yes, this is because the bug when using vhost_net + PI/TUN. And I guess 
the reason is that nobody tries to use that combination in the past.

I'm not even sure if it's a valid setup since vhost-net is a virtio-net 
kernel server which is not expected to handle L3 packets or PI header 
(which is Linux specific and out of the scope virtio spec).


>   The only thing I could do *without*
> debugging the kernel was tcpdump on the 'tun0' interface and see if the
> TX packets I put into the ring were even making it to the interface,
> and what they looked like if they did. (Losing the first 14 bytes and
> having the *next* 14 bytes scribbled on by an Ethernet header was a fun
> one.)


The tricky part is that, the networking stack thinks the packet is 
successfully received but it was actually dropped by vhost-net.

And there's no obvious userspace API to report such dropping as 
statistics counters or trace-points. Maybe we can tweak the vhost for a 
better logging in this case.


>
>
>
>
>
>>> Fix it to get the correct information along with the socket itself.
>>
>> I'm not sure what is fixed by this. It looks to me it tires to let
>> packet go even if the userspace set the wrong attributes to tap or
>> vhost. This is even sub-optimal than failing explicitly fail the RX.
> I'm OK with explicit failure. But once I'd let it *get* the information
> from the underlying socket in order to decide whether it should fail or
> not, it turned out to be easy enough just to make those configs work
> anyway.


The problem is that this change may make some wrong configuration 
"works" silently at the level of vhost or TAP. When using this for VM, 
it would make the debugging even harder.


>
> The main case where that "easy enough" is stretched a little (IMO) was
> when there's a tun_pi header. I have one more of your emails to reply
> to after this, and I'll address that there.
>
>
>>> As a side-effect, this means that tun_get_socket() won't work if the
>>> tun file isn't actually connected to a device, since there's no 'tun'
>>> yet in that case to get the information from.
>>
>> This may break the existing application. Vhost-net is tied to the socket
>> instead of the device that the socket is loosely coupled.
> Hm. Perhaps the PI and vnet hdr should be considered an option of the
> *socket* (which is tied to the tfile), not purely an option of the
> underlying device?


Though this is how it is done in macvtap. It's probably too late to 
change tuntap.


>
> Or maybe it's sufficient just to get the flags from *either* tfile->tun
> or tfile->detached, so that it works when the queue is detached. I'll
> take a look.
>
> I suppose we could even have a fallback that makes stuff up like we do
> today. If the user attempts to attach a tun file descriptor to vhost
> without ever calling TUNSETIFF on it first, *then* we make the same
> assumptions we do today?


Then I would rather keep the using the assumption:

1) the value get from get_socket() might not be correct
2) the complexity or risk for bring a very little improvement of the 
debug-ability (which is still suspicious).


>
>>> --- a/drivers/vhost/net.c
>>> +++ b/drivers/vhost/net.c
>>> @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net)
>>>    
>>>    	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
>>>    		vq->log : NULL;
>>> -	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
>>> +	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) &&
>>> +		(vhost_hlen || sock_hlen >= sizeof(num_buffers));
>>
>> So this change the behavior. When mergeable buffer is enabled, userspace
>> expects the vhost to merge buffers. If the feature is disabled silently,
>> it violates virtio spec.
>>
>> If anything wrong in the setup, userspace just breaks itself.
>>
>> E.g if sock_hlen is less that struct virtio_net_hdr_mrg_buf. The packet
>> header might be overwrote by the vnet header.
> This wasn't intended to change the behaviour of any code path that is
> already working today. If *either* vhost or the underlying device have
> provided a vnet header, we still merge.
>
> If *neither* provide a vnet hdr, there's nowhere to put num_buffers and
> we can't merge.
>
> That code path doesn't work at all today, but does after my patches.


It looks to me it's a bug that userspace can keep working in this case. 
After mrg rx buffer is negotiated, userspace should always assumes the 
vhost-net to provide num_buffers.

> But you're right, we should explicitly refuse to negotiate
> VIRITO_NET_F_MSG_RXBUF in that case.


This would be very hard:

1) VHOST_SET_FEATURES and VHOST_NET_SET_BACKEND are two different ioctls
2) vhost_net is not tightly coupled with tuntap, vnet header size could 
be changed by userspace at any time


>
>>>    
>>>    	do {
>>>    		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
>>> @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net)
>>>    			}
>>>    		} else {
>>>    			/* Header came from socket; we'll need to patch
>>> -			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
>>> +			 * ->num_buffers over the last two bytes if
>>> +			 * VIRTIO_NET_F_MRG_RXBUF is enabled.
>>>    			 */
>>> -			iov_iter_advance(&fixup, sizeof(hdr));
>>> +			iov_iter_advance(&fixup, sock_hlen - 2);
>>
>> I'm not sure what did the above code want to fix. It doesn't change
>> anything if vnet header is set correctly in TUN. It only prevents the
>> the packet header from being rewrote.
>>
> It fixes the case where the virtio_net_hdr isn't at the start of the
> tun header, because the tun actually puts the tun_pi struct *first*,
> and *then* the virtio_net_hdr.


Right.


> The num_buffers field needs to go at the *end* of sock_hlen. Not at a
> fixed offset from the *start* of it.
>
> At least, that's true unless we want to just declare that we *only*
> support TUN with the IFF_NO_PI flag. (qv).


Yes, that's a good question. This is probably a hint that "vhost-net is 
never designed to work of PI", and even if it's not true, I'm not sure 
if it's too late to fix.

Thanks
diff mbox series

Patch

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 8e3a28ba6b28..2170a0d3d34c 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1246,7 +1246,7 @@  static const struct proto_ops tap_socket_ops = {
  * attached to a device.  The returned object works like a packet socket, it
  * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
  * holding a reference to the file for as long as the socket is in use. */
-struct socket *tap_get_socket(struct file *file)
+struct socket *tap_get_socket(struct file *file, size_t *hlen)
 {
 	struct tap_queue *q;
 	if (file->f_op != &tap_fops)
@@ -1254,6 +1254,9 @@  struct socket *tap_get_socket(struct file *file)
 	q = file->private_data;
 	if (!q)
 		return ERR_PTR(-EBADFD);
+	if (hlen)
+		*hlen = (q->flags & IFF_VNET_HDR) ? q->vnet_hdr_sz : 0;
+
 	return &q->sock;
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4cf38be26dc9..67b406fa0881 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3649,7 +3649,7 @@  static void tun_cleanup(void)
  * attached to a device.  The returned object works like a packet socket, it
  * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
  * holding a reference to the file for as long as the socket is in use. */
-struct socket *tun_get_socket(struct file *file)
+struct socket *tun_get_socket(struct file *file, size_t *hlen)
 {
 	struct tun_file *tfile;
 	if (file->f_op != &tun_fops)
@@ -3657,6 +3657,20 @@  struct socket *tun_get_socket(struct file *file)
 	tfile = file->private_data;
 	if (!tfile)
 		return ERR_PTR(-EBADFD);
+
+	if (hlen) {
+		struct tun_struct *tun = tun_get(tfile);
+		size_t len = 0;
+
+		if (!tun)
+			return ERR_PTR(-ENOTCONN);
+		if (tun->flags & IFF_VNET_HDR)
+			len += READ_ONCE(tun->vnet_hdr_sz);
+		if (!(tun->flags & IFF_NO_PI))
+			len += sizeof(struct tun_pi);
+		tun_put(tun);
+		*hlen = len;
+	}
 	return &tfile->socket;
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index df82b124170e..b92a7144ed90 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1143,7 +1143,8 @@  static void handle_rx(struct vhost_net *net)
 
 	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
-	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) &&
+		(vhost_hlen || sock_hlen >= sizeof(num_buffers));
 
 	do {
 		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
@@ -1213,9 +1214,10 @@  static void handle_rx(struct vhost_net *net)
 			}
 		} else {
 			/* Header came from socket; we'll need to patch
-			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
+			 * ->num_buffers over the last two bytes if
+			 * VIRTIO_NET_F_MRG_RXBUF is enabled.
 			 */
-			iov_iter_advance(&fixup, sizeof(hdr));
+			iov_iter_advance(&fixup, sock_hlen - 2);
 		}
 		/* TODO: Should check and handle checksum. */
 
@@ -1420,7 +1422,7 @@  static int vhost_net_release(struct inode *inode, struct file *f)
 	return 0;
 }
 
-static struct socket *get_raw_socket(int fd)
+static struct socket *get_raw_socket(int fd, size_t *hlen)
 {
 	int r;
 	struct socket *sock = sockfd_lookup(fd, &r);
@@ -1438,6 +1440,7 @@  static struct socket *get_raw_socket(int fd)
 		r = -EPFNOSUPPORT;
 		goto err;
 	}
+	*hlen = 0;
 	return sock;
 err:
 	sockfd_put(sock);
@@ -1463,33 +1466,33 @@  static struct ptr_ring *get_tap_ptr_ring(int fd)
 	return ring;
 }
 
-static struct socket *get_tap_socket(int fd)
+static struct socket *get_tap_socket(int fd, size_t *hlen)
 {
 	struct file *file = fget(fd);
 	struct socket *sock;
 
 	if (!file)
 		return ERR_PTR(-EBADF);
-	sock = tun_get_socket(file);
+	sock = tun_get_socket(file, hlen);
 	if (!IS_ERR(sock))
 		return sock;
-	sock = tap_get_socket(file);
+	sock = tap_get_socket(file, hlen);
 	if (IS_ERR(sock))
 		fput(file);
 	return sock;
 }
 
-static struct socket *get_socket(int fd)
+static struct socket *get_socket(int fd, size_t *hlen)
 {
 	struct socket *sock;
 
 	/* special case to disable backend */
 	if (fd == -1)
 		return NULL;
-	sock = get_raw_socket(fd);
+	sock = get_raw_socket(fd, hlen);
 	if (!IS_ERR(sock))
 		return sock;
-	sock = get_tap_socket(fd);
+	sock = get_tap_socket(fd, hlen);
 	if (!IS_ERR(sock))
 		return sock;
 	return ERR_PTR(-ENOTSOCK);
@@ -1521,7 +1524,7 @@  static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		r = -EFAULT;
 		goto err_vq;
 	}
-	sock = get_socket(fd);
+	sock = get_socket(fd, &nvq->sock_hlen);
 	if (IS_ERR(sock)) {
 		r = PTR_ERR(sock);
 		goto err_vq;
@@ -1621,7 +1624,7 @@  static long vhost_net_reset_owner(struct vhost_net *n)
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t vhost_hlen, sock_hlen, hdr_len;
+	size_t vhost_hlen, hdr_len;
 	int i;
 
 	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
@@ -1631,11 +1634,8 @@  static int vhost_net_set_features(struct vhost_net *n, u64 features)
 	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
 		/* vhost provides vnet_hdr */
 		vhost_hlen = hdr_len;
-		sock_hlen = 0;
 	} else {
-		/* socket provides vnet_hdr */
 		vhost_hlen = 0;
-		sock_hlen = hdr_len;
 	}
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
@@ -1651,7 +1651,6 @@  static int vhost_net_set_features(struct vhost_net *n, u64 features)
 		mutex_lock(&n->vqs[i].vq.mutex);
 		n->vqs[i].vq.acked_features = features;
 		n->vqs[i].vhost_hlen = vhost_hlen;
-		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].vq.mutex);
 	}
 	mutex_unlock(&n->dev.mutex);
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 915a187cfabd..b460ba98f34e 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -3,14 +3,14 @@ 
 #define _LINUX_IF_TAP_H_
 
 #if IS_ENABLED(CONFIG_TAP)
-struct socket *tap_get_socket(struct file *);
+struct socket *tap_get_socket(struct file *, size_t *);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
 struct file;
 struct socket;
-static inline struct socket *tap_get_socket(struct file *f)
+static inline struct socket *tap_get_socket(struct file *f, size_t *)
 {
 	return ERR_PTR(-EINVAL);
 }
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 2a7660843444..8a7debd3f663 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -25,7 +25,7 @@  struct tun_xdp_hdr {
 };
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
-struct socket *tun_get_socket(struct file *);
+struct socket *tun_get_socket(struct file *, size_t *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
 static inline bool tun_is_xdp_frame(void *ptr)
 {
@@ -45,7 +45,7 @@  void tun_ptr_free(void *ptr);
 #include <linux/errno.h>
 struct file;
 struct socket;
-static inline struct socket *tun_get_socket(struct file *f)
+static inline struct socket *tun_get_socket(struct file *f, size_t *)
 {
 	return ERR_PTR(-EINVAL);
 }