Message ID | 20210624123005.1301761-1-dwmw2@infradead.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [v3,1/5] net: add header len parameter to tun_get_socket(), tap_get_socket() | expand |
Context | Check | Description |
---|---|---|
netdev/cover_letter | warning | Series does not have a cover letter |
netdev/fixes_present | success | Link |
netdev/patch_count | success | Link |
netdev/tree_selection | success | Guessed tree name to be net-next |
netdev/subject_prefix | success | Link |
netdev/cc_maintainers | fail | 2 blamed authors not CCed: mst@redhat.com dlstevens@us.ibm.com; 6 maintainers not CCed: kvm@vger.kernel.org mst@redhat.com dlstevens@us.ibm.com virtualization@lists.linux-foundation.org davem@davemloft.net kuba@kernel.org |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Link |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 1 this patch: 1 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success | Link |
netdev/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 186 lines checked |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 1 this patch: 1 |
netdev/header_inline | success | Link |
在 2021/6/24 下午8:30, David Woodhouse 写道: > From: David Woodhouse <dwmw@amazon.co.uk> > > The vhost-net driver was making wild assumptions about the header length > of the underlying tun/tap socket. It's by design to depend on the userspace to co-ordinate the vnet header setting with the underlying sockets. > Then it was discarding packets if > the number of bytes it got from sock_recvmsg() didn't precisely match > its guess. Anything that is broken by this? The failure is a hint for the userspace that something is wrong during the coordination. > > Fix it to get the correct information along with the socket itself. I'm not sure what is fixed by this. It looks to me it tires to let packet go even if the userspace set the wrong attributes to tap or vhost. This is even sub-optimal than failing explicitly fail the RX. > As a side-effect, this means that tun_get_socket() won't work if the > tun file isn't actually connected to a device, since there's no 'tun' > yet in that case to get the information from. This may break the existing application. Vhost-net is tied to the socket instead of the device that the socket is loosely coupled. > > On the receive side, where the tun device generates the virtio_net_hdr > but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill > in the 'num_buffers' field on top of the existing virtio_net_hdr, fix > that to use 'sock_hlen - 2' as the location, which means that it goes > in the right place regardless of whether the tun device is using an > additional tun_pi header or not. In this case, the user should have > configured the tun device with a vnet hdr size of 12, to make room. 
> > Fixes: 8dd014adfea6f ("vhost-net: mergeable buffers support") > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> > --- > drivers/net/tap.c | 5 ++++- > drivers/net/tun.c | 16 +++++++++++++++- > drivers/vhost/net.c | 31 +++++++++++++++---------------- > include/linux/if_tap.h | 4 ++-- > include/linux/if_tun.h | 4 ++-- > 5 files changed, 38 insertions(+), 22 deletions(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index 8e3a28ba6b28..2170a0d3d34c 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -1246,7 +1246,7 @@ static const struct proto_ops tap_socket_ops = { > * attached to a device. The returned object works like a packet socket, it > * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for > * holding a reference to the file for as long as the socket is in use. */ > -struct socket *tap_get_socket(struct file *file) > +struct socket *tap_get_socket(struct file *file, size_t *hlen) > { > struct tap_queue *q; > if (file->f_op != &tap_fops) > @@ -1254,6 +1254,9 @@ struct socket *tap_get_socket(struct file *file) > q = file->private_data; > if (!q) > return ERR_PTR(-EBADFD); > + if (hlen) > + *hlen = (q->flags & IFF_VNET_HDR) ? q->vnet_hdr_sz : 0; > + > return &q->sock; > } > EXPORT_SYMBOL_GPL(tap_get_socket); > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 4cf38be26dc9..67b406fa0881 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -3649,7 +3649,7 @@ static void tun_cleanup(void) > * attached to a device. The returned object works like a packet socket, it > * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for > * holding a reference to the file for as long as the socket is in use. 
*/ > -struct socket *tun_get_socket(struct file *file) > +struct socket *tun_get_socket(struct file *file, size_t *hlen) > { > struct tun_file *tfile; > if (file->f_op != &tun_fops) > @@ -3657,6 +3657,20 @@ struct socket *tun_get_socket(struct file *file) > tfile = file->private_data; > if (!tfile) > return ERR_PTR(-EBADFD); > + > + if (hlen) { > + struct tun_struct *tun = tun_get(tfile); > + size_t len = 0; > + > + if (!tun) > + return ERR_PTR(-ENOTCONN); > + if (tun->flags & IFF_VNET_HDR) > + len += READ_ONCE(tun->vnet_hdr_sz); > + if (!(tun->flags & IFF_NO_PI)) > + len += sizeof(struct tun_pi); > + tun_put(tun); > + *hlen = len; > + } > return &tfile->socket; > } > EXPORT_SYMBOL_GPL(tun_get_socket); > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index df82b124170e..b92a7144ed90 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net) > > vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? > vq->log : NULL; > - mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); > + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) && > + (vhost_hlen || sock_hlen >= sizeof(num_buffers)); So this change the behavior. When mergeable buffer is enabled, userspace expects the vhost to merge buffers. If the feature is disabled silently, it violates virtio spec. If anything wrong in the setup, userspace just breaks itself. E.g if sock_hlen is less that struct virtio_net_hdr_mrg_buf. The packet header might be overwrote by the vnet header. > > do { > sock_len = vhost_net_rx_peek_head_len(net, sock->sk, > @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net) > } > } else { > /* Header came from socket; we'll need to patch > - * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF > + * ->num_buffers over the last two bytes if > + * VIRTIO_NET_F_MRG_RXBUF is enabled. 
> */ > - iov_iter_advance(&fixup, sizeof(hdr)); > + iov_iter_advance(&fixup, sock_hlen - 2); I'm not sure what the above code is meant to fix. It doesn't change anything if the vnet header is set correctly in TUN. It only prevents the packet header from being rewritten. Thanks
On Fri, 2021-06-25 at 13:00 +0800, Jason Wang wrote: > 在 2021/6/24 下午8:30, David Woodhouse 写道: > > From: David Woodhouse <dwmw@amazon.co.uk> > > > > The vhost-net driver was making wild assumptions about the header length > > of the underlying tun/tap socket. > > > It's by design to depend on the userspace to co-ordinate the vnet header > setting with the underlying sockets. > > > > Then it was discarding packets if > > the number of bytes it got from sock_recvmsg() didn't precisely match > > its guess. > > > Anything that is broken by this? The failure is a hint for the userspace > that something is wrong during the coordination. I am not a fan of this approach. I firmly believe that for a given configuration, the kernel should either *work* or it should gracefully refuse to set it up that way. And the requirements should be clearly documented. Having been on the receiving end of this "hint" of which you speak, I found it distinctly suboptimal as a user interface. I was left scrabbling around trying to find a set of options which *would* work, and it was only through debugging the kernel that I managed to work out that I: • MUST set IFF_NO_PI • MUST use TUNSETSNDBUF to reduce the sndbuf from INT_MAX • MUST use a virtio_net_hdr that I don't want If my application failed to do any of those things, I got a silent failure to transport any packets. The only thing I could do *without* debugging the kernel was tcpdump on the 'tun0' interface and see if the TX packets I put into the ring were even making it to the interface, and what they looked like if they did. (Losing the first 14 bytes and having the *next* 14 bytes scribbled on by an Ethernet header was a fun one.) > > > > Fix it to get the correct information along with the socket itself. > > > I'm not sure what is fixed by this. It looks to me it tires to let > packet go even if the userspace set the wrong attributes to tap or > vhost. This is even sub-optimal than failing explicitly fail the RX. 
I'm OK with explicit failure. But once I'd let it *get* the information from the underlying socket in order to decide whether it should fail or not, it turned out to be easy enough just to make those configs work anyway. The main case where that "easy enough" is stretched a little (IMO) was when there's a tun_pi header. I have one more of your emails to reply to after this, and I'll address that there. > > > As a side-effect, this means that tun_get_socket() won't work if the > > tun file isn't actually connected to a device, since there's no 'tun' > > yet in that case to get the information from. > > > This may break the existing application. Vhost-net is tied to the socket > instead of the device that the socket is loosely coupled. Hm. Perhaps the PI and vnet hdr should be considered an option of the *socket* (which is tied to the tfile), not purely an option of the underlying device? Or maybe it's sufficient just to get the flags from *either* tfile->tun or tfile->detached, so that it works when the queue is detached. I'll take a look. I suppose we could even have a fallback that makes stuff up like we do today. If the user attempts to attach a tun file descriptor to vhost without ever calling TUNSETIFF on it first, *then* we make the same assumptions we do today? > > --- a/drivers/vhost/net.c > > +++ b/drivers/vhost/net.c > > @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net) > > > > vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? > > vq->log : NULL; > > - mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); > > + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) && > > + (vhost_hlen || sock_hlen >= sizeof(num_buffers)); > > > So this change the behavior. When mergeable buffer is enabled, userspace > expects the vhost to merge buffers. If the feature is disabled silently, > it violates virtio spec. > > If anything wrong in the setup, userspace just breaks itself. 
> > E.g if sock_hlen is less that struct virtio_net_hdr_mrg_buf. The packet > header might be overwrote by the vnet header. This wasn't intended to change the behaviour of any code path that is already working today. If *either* vhost or the underlying device have provided a vnet header, we still merge. If *neither* provide a vnet hdr, there's nowhere to put num_buffers and we can't merge. That code path doesn't work at all today, but does after my patches. But you're right, we should explicitly refuse to negotiate VIRITO_NET_F_MSG_RXBUF in that case. > > > > > do { > > sock_len = vhost_net_rx_peek_head_len(net, sock->sk, > > @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net) > > } > > } else { > > /* Header came from socket; we'll need to patch > > - * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF > > + * ->num_buffers over the last two bytes if > > + * VIRTIO_NET_F_MRG_RXBUF is enabled. > > */ > > - iov_iter_advance(&fixup, sizeof(hdr)); > > + iov_iter_advance(&fixup, sock_hlen - 2); > > > I'm not sure what did the above code want to fix. It doesn't change > anything if vnet header is set correctly in TUN. It only prevents the > the packet header from being rewrote. > It fixes the case where the virtio_net_hdr isn't at the start of the tun header, because the tun actually puts the tun_pi struct *first*, and *then* the virtio_net_hdr. The num_buffers field needs to go at the *end* of sock_hlen. Not at a fixed offset from the *start* of it. At least, that's true unless we want to just declare that we *only* support TUN with the IFF_NO_PI flag. (qv).
On Thu, Jun 24, 2021 at 8:30 AM David Woodhouse <dwmw2@infradead.org> wrote: > > From: David Woodhouse <dwmw@amazon.co.uk> > > The vhost-net driver was making wild assumptions about the header length If respinning, please describe more concretely which configuration is currently broken. IFF_NO_PI + IFF_VNET_HDR, if I understand correctly. But I got that from the discussion, not the commit message. > of the underlying tun/tap socket. Then it was discarding packets if > the number of bytes it got from sock_recvmsg() didn't precisely match > its guess. > > Fix it to get the correct information along with the socket itself. > As a side-effect, this means that tun_get_socket() won't work if the > tun file isn't actually connected to a device, since there's no 'tun' > yet in that case to get the information from. > > On the receive side, where the tun device generates the virtio_net_hdr > but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill Nit: VIRTIO_NET_F_MRG_RXBUF > in the 'num_buffers' field on top of the existing virtio_net_hdr, fix > that to use 'sock_hlen - 2' as the location, which means that it goes Please use sizeof(hdr.num_buffers) instead of a raw constant 2, to make the code self-documenting. Should this be an independent one-line fix?
On Fri, 2021-06-25 at 14:13 -0400, Willem de Bruijn wrote: > On Thu, Jun 24, 2021 at 8:30 AM David Woodhouse <dwmw2@infradead.org> > wrote: > > > > From: David Woodhouse <dwmw@amazon.co.uk> > > > > The vhost-net driver was making wild assumptions about the header > > length > > If respinning, please more concretely describe which configuration is > currently broken. Fairly much all of them. Here's a test run on the 5.12.8 kernel: $ sudo ./test_vhost_net TEST: (hdr 0, xdp 0, pi 0, features 0) RESULT: -1 TEST: (hdr 10, xdp 0, pi 0, features 0) RESULT: 0 TEST: (hdr 12, xdp 0, pi 0, features 0) RESULT: -1 TEST: (hdr 20, xdp 0, pi 0, features 0) RESULT: -1 TEST: (hdr 0, xdp 1, pi 0, features 0) RESULT: -1 TEST: (hdr 10, xdp 1, pi 0, features 0) RESULT: -1 TEST: (hdr 12, xdp 1, pi 0, features 0) RESULT: -1 TEST: (hdr 20, xdp 1, pi 0, features 0) RESULT: -1 TEST: (hdr 0, xdp 0, pi 1, features 0) RESULT: -1 TEST: (hdr 10, xdp 0, pi 1, features 0) RESULT: -1 TEST: (hdr 12, xdp 0, pi 1, features 0) RESULT: -1 TEST: (hdr 20, xdp 0, pi 1, features 0) RESULT: -1 TEST: (hdr 0, xdp 1, pi 1, features 0) RESULT: -1 TEST: (hdr 10, xdp 1, pi 1, features 0) RESULT: -1 TEST: (hdr 12, xdp 1, pi 1, features 0) RESULT: -1 TEST: (hdr 20, xdp 1, pi 1, features 0) RESULT: -1 TEST: (hdr 0, xdp 0, pi 0, features 100000000) RESULT: -1 TEST: (hdr 10, xdp 0, pi 0, features 100000000) RESULT: -1 TEST: (hdr 12, xdp 0, pi 0, features 100000000) RESULT: 0 TEST: (hdr 20, xdp 0, pi 0, features 100000000) RESULT: -1 TEST: (hdr 0, xdp 1, pi 0, features 100000000) RESULT: -1 TEST: (hdr 10, xdp 1, pi 0, features 100000000) RESULT: -1 TEST: (hdr 12, xdp 1, pi 0, features 100000000) RESULT: -1 TEST: (hdr 20, xdp 1, pi 0, features 100000000) RESULT: -1 TEST: (hdr 0, xdp 0, pi 1, features 100000000) RESULT: -1 TEST: (hdr 10, xdp 0, pi 1, features 100000000) RESULT: -1 TEST: (hdr 12, xdp 0, pi 1, features 100000000) RESULT: -1 TEST: (hdr 20, xdp 0, pi 1, features 100000000) RESULT: -1 TEST: (hdr 0, xdp 1, pi 1, 
features 100000000) RESULT: -1 TEST: (hdr 10, xdp 1, pi 1, features 100000000) RESULT: -1 TEST: (hdr 12, xdp 1, pi 1, features 100000000) RESULT: -1 TEST: (hdr 20, xdp 1, pi 1, features 100000000) RESULT: -1 TEST: (hdr 0, xdp 0, pi 0, features 8000000) RESULT: 0 TEST: (hdr 0, xdp 1, pi 0, features 8000000) RESULT: -1 TEST: (hdr 0, xdp 0, pi 1, features 8000000) RESULT: -1 TEST: (hdr 0, xdp 1, pi 1, features 8000000) RESULT: -1 TEST: (hdr 0, xdp 0, pi 0, features 108000000) RESULT: 0 TEST: (hdr 0, xdp 1, pi 0, features 108000000) RESULT: -1 TEST: (hdr 0, xdp 0, pi 1, features 108000000) RESULT: -1 TEST: (hdr 0, xdp 1, pi 1, features 108000000) RESULT: -1 > IFF_NO_PI + IFF_VNET_HDR, if I understand correctly. That's fairly much the only one that *did* work. As long as you use TUNSETSNDBUF which has the undocumented side-effect of turning off the XDP path. > > On the receive side, where the tun device generates the virtio_net_hdr > > but VIRITO_NET_F_MSG_RXBUF was negotiated and vhost-net needs to fill > > Nit: VIRTIO_NET_F_MSG_RXBUF Thanks. > > in the 'num_buffers' field on top of the existing virtio_net_hdr, fix > > that to use 'sock_hlen - 2' as the location, which means that it goes > > Please use sizeof(hdr.num_buffers) instead of a raw constant 2, to > self document the code. Makes sense. > Should this be an independent one-line fix? I don't think so; it's very much intertwined with the way it makes assumptions about someone else's data.
在 2021/6/25 下午4:23, David Woodhouse 写道: > On Fri, 2021-06-25 at 13:00 +0800, Jason Wang wrote: >> 在 2021/6/24 下午8:30, David Woodhouse 写道: >>> From: David Woodhouse <dwmw@amazon.co.uk> >>> >>> The vhost-net driver was making wild assumptions about the header length >>> of the underlying tun/tap socket. >> >> It's by design to depend on the userspace to co-ordinate the vnet header >> setting with the underlying sockets. >> >> >>> Then it was discarding packets if >>> the number of bytes it got from sock_recvmsg() didn't precisely match >>> its guess. >> >> Anything that is broken by this? The failure is a hint for the userspace >> that something is wrong during the coordination. > I am not a fan of this approach. I firmly believe that for a given > configuration, the kernel should either *work* or it should gracefully > refuse to set it up that way. And the requirements should be clearly > documented. That works only if all the logic were implemented in the same module but not the case in the e.g networking stack that a packet need to iterate several modules. E.g in this case, the vnet header size of the TAP could be changed at anytime via TUNSETVNETHDRSZ, and tuntap is unaware of the existence of vhost_net. This makes it impossible to do refuse in the case of setup (SET_BACKEND). > > Having been on the receiving end of this "hint" of which you speak, I > found it distinctly suboptimal as a user interface. I was left > scrabbling around trying to find a set of options which *would* work, > and it was only through debugging the kernel that I managed to work out > that I: > > • MUST set IFF_NO_PI > • MUST use TUNSETSNDBUF to reduce the sndbuf from INT_MAX > • MUST use a virtio_net_hdr that I don't want > > If my application failed to do any of those things, I got a silent > failure to transport any packets. Yes, this is because the bug when using vhost_net + PI/TUN. And I guess the reason is that nobody tries to use that combination in the past. 
I'm not even sure if it's a valid setup since vhost-net is a virtio-net kernel server which is not expected to handle L3 packets or a PI header (which is Linux specific and outside the scope of the virtio spec). > The only thing I could do *without* > debugging the kernel was tcpdump on the 'tun0' interface and see if the > TX packets I put into the ring were even making it to the interface, > and what they looked like if they did. (Losing the first 14 bytes and > having the *next* 14 bytes scribbled on by an Ethernet header was a fun > one.) The tricky part is that the networking stack thinks the packet is successfully received but it was actually dropped by vhost-net. And there's no obvious userspace API to report such drops, such as statistics counters or trace-points. Maybe we can tweak vhost for better logging in this case. > > > > > >>> Fix it to get the correct information along with the socket itself. >> >> I'm not sure what is fixed by this. It looks to me it tires to let >> packet go even if the userspace set the wrong attributes to tap or >> vhost. This is even sub-optimal than failing explicitly fail the RX. > I'm OK with explicit failure. But once I'd let it *get* the information > from the underlying socket in order to decide whether it should fail or > not, it turned out to be easy enough just to make those configs work > anyway. The problem is that this change may make some wrong configurations "work" silently at the level of vhost or TAP. When using this for a VM, it would make debugging even harder. > > The main case where that "easy enough" is stretched a little (IMO) was > when there's a tun_pi header. I have one more of your emails to reply > to after this, and I'll address that there. > > >>> As a side-effect, this means that tun_get_socket() won't work if the >>> tun file isn't actually connected to a device, since there's no 'tun' >>> yet in that case to get the information from. >> >> This may break the existing application.
Vhost-net is tied to the socket >> instead of the device that the socket is loosely coupled. > Hm. Perhaps the PI and vnet hdr should be considered an option of the > *socket* (which is tied to the tfile), not purely an option of the > underlying device? Though this is how it is done in macvtap. It's probably too late to change tuntap. > > Or maybe it's sufficient just to get the flags from *either* tfile->tun > or tfile->detached, so that it works when the queue is detached. I'll > take a look. > > I suppose we could even have a fallback that makes stuff up like we do > today. If the user attempts to attach a tun file descriptor to vhost > without ever calling TUNSETIFF on it first, *then* we make the same > assumptions we do today? Then I would rather keep the using the assumption: 1) the value get from get_socket() might not be correct 2) the complexity or risk for bring a very little improvement of the debug-ability (which is still suspicious). > >>> --- a/drivers/vhost/net.c >>> +++ b/drivers/vhost/net.c >>> @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net) >>> >>> vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? >>> vq->log : NULL; >>> - mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); >>> + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) && >>> + (vhost_hlen || sock_hlen >= sizeof(num_buffers)); >> >> So this change the behavior. When mergeable buffer is enabled, userspace >> expects the vhost to merge buffers. If the feature is disabled silently, >> it violates virtio spec. >> >> If anything wrong in the setup, userspace just breaks itself. >> >> E.g if sock_hlen is less that struct virtio_net_hdr_mrg_buf. The packet >> header might be overwrote by the vnet header. > This wasn't intended to change the behaviour of any code path that is > already working today. If *either* vhost or the underlying device have > provided a vnet header, we still merge. 
> > If *neither* provide a vnet hdr, there's nowhere to put num_buffers and > we can't merge. > > That code path doesn't work at all today, but does after my patches. It looks to me it's a bug that userspace can keep working in this case. After mrg rx buffer is negotiated, userspace should always assumes the vhost-net to provide num_buffers. > But you're right, we should explicitly refuse to negotiate > VIRITO_NET_F_MSG_RXBUF in that case. This would be very hard: 1) VHOST_SET_FEATURES and VHOST_NET_SET_BACKEND are two different ioctls 2) vhost_net is not tightly coupled with tuntap, vnet header size could be changed by userspace at any time > >>> >>> do { >>> sock_len = vhost_net_rx_peek_head_len(net, sock->sk, >>> @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net) >>> } >>> } else { >>> /* Header came from socket; we'll need to patch >>> - * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF >>> + * ->num_buffers over the last two bytes if >>> + * VIRTIO_NET_F_MRG_RXBUF is enabled. >>> */ >>> - iov_iter_advance(&fixup, sizeof(hdr)); >>> + iov_iter_advance(&fixup, sock_hlen - 2); >> >> I'm not sure what did the above code want to fix. It doesn't change >> anything if vnet header is set correctly in TUN. It only prevents the >> the packet header from being rewrote. >> > It fixes the case where the virtio_net_hdr isn't at the start of the > tun header, because the tun actually puts the tun_pi struct *first*, > and *then* the virtio_net_hdr. Right. > The num_buffers field needs to go at the *end* of sock_hlen. Not at a > fixed offset from the *start* of it. > > At least, that's true unless we want to just declare that we *only* > support TUN with the IFF_NO_PI flag. (qv). Yes, that's a good question. This is probably a hint that "vhost-net is never designed to work of PI", and even if it's not true, I'm not sure if it's too late to fix. Thanks
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 8e3a28ba6b28..2170a0d3d34c 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1246,7 +1246,7 @@ static const struct proto_ops tap_socket_ops = { * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ -struct socket *tap_get_socket(struct file *file) +struct socket *tap_get_socket(struct file *file, size_t *hlen) { struct tap_queue *q; if (file->f_op != &tap_fops) @@ -1254,6 +1254,9 @@ struct socket *tap_get_socket(struct file *file) q = file->private_data; if (!q) return ERR_PTR(-EBADFD); + if (hlen) + *hlen = (q->flags & IFF_VNET_HDR) ? q->vnet_hdr_sz : 0; + return &q->sock; } EXPORT_SYMBOL_GPL(tap_get_socket); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 4cf38be26dc9..67b406fa0881 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3649,7 +3649,7 @@ static void tun_cleanup(void) * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. 
*/ -struct socket *tun_get_socket(struct file *file) +struct socket *tun_get_socket(struct file *file, size_t *hlen) { struct tun_file *tfile; if (file->f_op != &tun_fops) @@ -3657,6 +3657,20 @@ struct socket *tun_get_socket(struct file *file) tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); + + if (hlen) { + struct tun_struct *tun = tun_get(tfile); + size_t len = 0; + + if (!tun) + return ERR_PTR(-ENOTCONN); + if (tun->flags & IFF_VNET_HDR) + len += READ_ONCE(tun->vnet_hdr_sz); + if (!(tun->flags & IFF_NO_PI)) + len += sizeof(struct tun_pi); + tun_put(tun); + *hlen = len; + } return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index df82b124170e..b92a7144ed90 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1143,7 +1143,8 @@ static void handle_rx(struct vhost_net *net) vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; - mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF) && + (vhost_hlen || sock_hlen >= sizeof(num_buffers)); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, @@ -1213,9 +1214,10 @@ static void handle_rx(struct vhost_net *net) } } else { /* Header came from socket; we'll need to patch - * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF + * ->num_buffers over the last two bytes if + * VIRTIO_NET_F_MRG_RXBUF is enabled. */ - iov_iter_advance(&fixup, sizeof(hdr)); + iov_iter_advance(&fixup, sock_hlen - 2); } /* TODO: Should check and handle checksum. 
*/ @@ -1420,7 +1422,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) return 0; } -static struct socket *get_raw_socket(int fd) +static struct socket *get_raw_socket(int fd, size_t *hlen) { int r; struct socket *sock = sockfd_lookup(fd, &r); @@ -1438,6 +1440,7 @@ static struct socket *get_raw_socket(int fd) r = -EPFNOSUPPORT; goto err; } + *hlen = 0; return sock; err: sockfd_put(sock); @@ -1463,33 +1466,33 @@ static struct ptr_ring *get_tap_ptr_ring(int fd) return ring; } -static struct socket *get_tap_socket(int fd) +static struct socket *get_tap_socket(int fd, size_t *hlen) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); - sock = tun_get_socket(file); + sock = tun_get_socket(file, hlen); if (!IS_ERR(sock)) return sock; - sock = tap_get_socket(file); + sock = tap_get_socket(file, hlen); if (IS_ERR(sock)) fput(file); return sock; } -static struct socket *get_socket(int fd) +static struct socket *get_socket(int fd, size_t *hlen) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; - sock = get_raw_socket(fd); + sock = get_raw_socket(fd, hlen); if (!IS_ERR(sock)) return sock; - sock = get_tap_socket(fd); + sock = get_tap_socket(fd, hlen); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); @@ -1521,7 +1524,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) r = -EFAULT; goto err_vq; } - sock = get_socket(fd); + sock = get_socket(fd, &nvq->sock_hlen); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; @@ -1621,7 +1624,7 @@ static long vhost_net_reset_owner(struct vhost_net *n) static int vhost_net_set_features(struct vhost_net *n, u64 features) { - size_t vhost_hlen, sock_hlen, hdr_len; + size_t vhost_hlen, hdr_len; int i; hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | @@ -1631,11 +1634,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost 
provides vnet_hdr */ vhost_hlen = hdr_len; - sock_hlen = 0; } else { - /* socket provides vnet_hdr */ vhost_hlen = 0; - sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && @@ -1651,7 +1651,6 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; n->vqs[i].vhost_hlen = vhost_hlen; - n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 915a187cfabd..b460ba98f34e 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -3,14 +3,14 @@ #define _LINUX_IF_TAP_H_ #if IS_ENABLED(CONFIG_TAP) -struct socket *tap_get_socket(struct file *); +struct socket *tap_get_socket(struct file *, size_t *); struct ptr_ring *tap_get_ptr_ring(struct file *file); #else #include <linux/err.h> #include <linux/errno.h> struct file; struct socket; -static inline struct socket *tap_get_socket(struct file *f) +static inline struct socket *tap_get_socket(struct file *f, size_t *) { return ERR_PTR(-EINVAL); } diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 2a7660843444..8a7debd3f663 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -25,7 +25,7 @@ struct tun_xdp_hdr { }; #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) -struct socket *tun_get_socket(struct file *); +struct socket *tun_get_socket(struct file *, size_t *); struct ptr_ring *tun_get_tx_ring(struct file *file); static inline bool tun_is_xdp_frame(void *ptr) { @@ -45,7 +45,7 @@ void tun_ptr_free(void *ptr); #include <linux/errno.h> struct file; struct socket; -static inline struct socket *tun_get_socket(struct file *f) +static inline struct socket *tun_get_socket(struct file *f, size_t *) { return ERR_PTR(-EINVAL); }