Message ID | 20250204-vsock-linger-nullderef-v1-2-6eb1760fa93e@rbox.co (mailing list archive)
---|---
State | Superseded
Delegated to | Netdev Maintainers
Series | vsock: null-ptr-deref when SO_LINGER enabled
On Tue, Feb 04, 2025 at 01:29:53AM +0100, Michal Luczaj wrote:
>Explicitly close() a TCP_ESTABLISHED (connectible) socket with SO_LINGER
>enabled. May trigger a null pointer dereference.
>
>Signed-off-by: Michal Luczaj <mhal@rbox.co>
>---
> tools/testing/vsock/vsock_test.c | 41 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 41 insertions(+)
>
>diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
>index dfff8b288265f96b602cb1bfa0e6dce02f114222..d0f6d253ac72d08a957cb81a3c38fcc72bec5a53 100644
>--- a/tools/testing/vsock/vsock_test.c
>+++ b/tools/testing/vsock/vsock_test.c
>@@ -1788,6 +1788,42 @@ static void test_stream_connect_retry_server(const struct test_opts *opts)
> 	close(fd);
> }
>
>+static void test_stream_linger_client(const struct test_opts *opts)
>+{
>+	struct linger optval = {
>+		.l_onoff = 1,
>+		.l_linger = 1
>+	};
>+	int fd;
>+
>+	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
>+	if (fd < 0) {
>+		perror("connect");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
>+		perror("setsockopt(SO_LINGER)");
>+		exit(EXIT_FAILURE);
>+	}

Since we are testing SO_LINGER, will also be nice to check if it's
working properly, since one of the fixes proposed could break it.

To test, we may set a small SO_VM_SOCKETS_BUFFER_SIZE on the receive
side and try to send more than that value, obviously without reading
anything into the receiver, and check that close() here, returns after
the timeout we set in .l_linger.

Of course, we could also add it later, but while we're at it, it crossed
my mind. WDYT?

Thanks,
Stefano

>+
>+	close(fd);
>+}
>+
>+static void test_stream_linger_server(const struct test_opts *opts)
>+{
>+	int fd;
>+
>+	fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
>+	if (fd < 0) {
>+		perror("accept");
>+		exit(EXIT_FAILURE);
>+	}
>+
>+	vsock_wait_remote_close(fd);
>+	close(fd);
>+}
>+
> static struct test_case test_cases[] = {
> 	{
> 		.name = "SOCK_STREAM connection reset",
>@@ -1943,6 +1979,11 @@ static struct test_case test_cases[] = {
> 		.run_client = test_stream_connect_retry_client,
> 		.run_server = test_stream_connect_retry_server,
> 	},
>+	{
>+		.name = "SOCK_STREAM SO_LINGER null-ptr-deref",
>+		.run_client = test_stream_linger_client,
>+		.run_server = test_stream_linger_server,
>+	},
> 	{},
> };
>
>--
>2.48.1
>
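For illustration (not part of the series), a rough sketch of the timing check Stefano describes, written in the style of vsock_test.c. It assumes the server side has shrunk its SO_VM_SOCKETS_BUFFER_SIZE and reads nothing; the test name, the 64 KiB send size and the helper names (vsock_stream_connect, current_nsec, NSEC_PER_SEC, taken from the same file) are assumptions here, and as the follow-ups show, whether close() actually blocks for .l_linger depends on which linger semantics the kernel ends up implementing.

```
/* Sketch only: time close() on a lingering socket whose written data
 * cannot drain, assuming the peer shrank SO_VM_SOCKETS_BUFFER_SIZE and
 * never read()s. Helpers are those already defined in vsock_test.c.
 */
static void test_stream_linger_timing_client(const struct test_opts *opts)
{
	struct linger optval = {
		.l_onoff = 1,
		.l_linger = 1
	};
	unsigned char buf[64 * 1024] = { 0 };
	uint64_t start_ns;
	int fd;

	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
	if (fd < 0) {
		perror("connect");
		exit(EXIT_FAILURE);
	}

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
		perror("setsockopt(SO_LINGER)");
		exit(EXIT_FAILURE);
	}

	/* Try to overrun the peer's (small) buffer; don't block on send(). */
	if (send(fd, buf, sizeof(buf), MSG_DONTWAIT) < 0)
		perror("send");

	start_ns = current_nsec();
	close(fd);

	if (current_nsec() - start_ns < NSEC_PER_SEC)
		fprintf(stderr, "close() returned before .l_linger expired\n");
}
```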
On 2/4/25 11:48, Stefano Garzarella wrote:
> On Tue, Feb 04, 2025 at 01:29:53AM +0100, Michal Luczaj wrote:
>> ...
>> +static void test_stream_linger_client(const struct test_opts *opts)
>> +{
>> +	struct linger optval = {
>> +		.l_onoff = 1,
>> +		.l_linger = 1
>> +	};
>> +	int fd;
>> +
>> +	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
>> +	if (fd < 0) {
>> +		perror("connect");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
>> +		perror("setsockopt(SO_LINGER)");
>> +		exit(EXIT_FAILURE);
>> +	}
>
> Since we are testing SO_LINGER, will also be nice to check if it's
> working properly, since one of the fixes proposed could break it.
>
> To test, we may set a small SO_VM_SOCKETS_BUFFER_SIZE on the receive
> side and try to send more than that value, obviously without reading
> anything into the receiver, and check that close() here, returns after
> the timeout we set in .l_linger.

I may be doing something wrong, but (at least for loopback transport) it
seems that close() lingers until data is received, not sent (without even
touching SO_VM_SOCKETS_BUFFER_SIZE).

```
import struct, fcntl, termios, time
from socket import *

def linger(s, timeout):
    if s.family == AF_VSOCK:
        s.setsockopt(SOL_SOCKET, SO_LINGER, (timeout<<32) | 1)
    elif s.family == AF_INET:
        s.setsockopt(SOL_SOCKET, SO_LINGER, struct.pack('ii', 1, timeout))
    else:
        assert False

def unsent(s):
    SIOCOUTQ = termios.TIOCOUTQ
    return struct.unpack('I', fcntl.ioctl(s, SIOCOUTQ, bytes(4)))[0]

def check_lingering(family, addr):
    lis = socket(family, SOCK_STREAM)
    lis.bind(addr)
    lis.listen()

    s = socket(family, SOCK_STREAM)
    linger(s, 2)
    s.connect(lis.getsockname())

    for _ in range(1, 1<<8):
        s.send(b'x')

    while unsent(s) != 0:
        pass

    print("closing...")
    ts = time.time()
    s.close()
    print(f"done in %ds" % (time.time() - ts))

check_lingering(AF_INET, ('127.0.0.1', 1234))
check_lingering(AF_VSOCK, (1, 1234)) # VMADDR_CID_LOCAL
```

Gives me:
closing...
done in 0s
closing...
done in 2s
On Wed, Feb 05, 2025 at 12:20:56PM +0100, Michal Luczaj wrote:
>On 2/4/25 11:48, Stefano Garzarella wrote:
>> On Tue, Feb 04, 2025 at 01:29:53AM +0100, Michal Luczaj wrote:
>>> ...
>>> +static void test_stream_linger_client(const struct test_opts *opts)
>>> +{
>>> +	struct linger optval = {
>>> +		.l_onoff = 1,
>>> +		.l_linger = 1
>>> +	};
>>> +	int fd;
>>> +
>>> +	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
>>> +	if (fd < 0) {
>>> +		perror("connect");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
>>> +		perror("setsockopt(SO_LINGER)");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>
>> Since we are testing SO_LINGER, will also be nice to check if it's
>> working properly, since one of the fixes proposed could break it.
>>
>> To test, we may set a small SO_VM_SOCKETS_BUFFER_SIZE on the receive
>> side and try to send more than that value, obviously without reading
>> anything into the receiver, and check that close() here, returns after
>> the timeout we set in .l_linger.
>
>I may be doing something wrong, but (at least for loopback transport) it

Also with VMs is the same, I think virtio_transport_wait_close() can be
improved to check if everything is sent, avoiding to wait.

But this is material for another series, so this test should be fine for now!

Thanks,
Stefano

>seems that close() lingers until data is received, not sent (without even
>touching SO_VM_SOCKETS_BUFFER_SIZE).
>
>```
>import struct, fcntl, termios, time
>from socket import *
>
>def linger(s, timeout):
>    if s.family == AF_VSOCK:
>        s.setsockopt(SOL_SOCKET, SO_LINGER, (timeout<<32) | 1)
>    elif s.family == AF_INET:
>        s.setsockopt(SOL_SOCKET, SO_LINGER, struct.pack('ii', 1, timeout))
>    else:
>        assert False
>
>def unsent(s):
>    SIOCOUTQ = termios.TIOCOUTQ
>    return struct.unpack('I', fcntl.ioctl(s, SIOCOUTQ, bytes(4)))[0]
>
>def check_lingering(family, addr):
>    lis = socket(family, SOCK_STREAM)
>    lis.bind(addr)
>    lis.listen()
>
>    s = socket(family, SOCK_STREAM)
>    linger(s, 2)
>    s.connect(lis.getsockname())
>
>    for _ in range(1, 1<<8):
>        s.send(b'x')
>
>    while unsent(s) != 0:
>        pass
>
>    print("closing...")
>    ts = time.time()
>    s.close()
>    print(f"done in %ds" % (time.time() - ts))
>
>check_lingering(AF_INET, ('127.0.0.1', 1234))
>check_lingering(AF_VSOCK, (1, 1234)) # VMADDR_CID_LOCAL
>```
>
>Gives me:
>closing...
>done in 0s
>closing...
>done in 2s
>
On 2/10/25 11:18, Stefano Garzarella wrote:
> On Wed, Feb 05, 2025 at 12:20:56PM +0100, Michal Luczaj wrote:
>> On 2/4/25 11:48, Stefano Garzarella wrote:
>>> On Tue, Feb 04, 2025 at 01:29:53AM +0100, Michal Luczaj wrote:
>>>> ...
>>>> +static void test_stream_linger_client(const struct test_opts *opts)
>>>> +{
>>>> +	struct linger optval = {
>>>> +		.l_onoff = 1,
>>>> +		.l_linger = 1
>>>> +	};
>>>> +	int fd;
>>>> +
>>>> +	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
>>>> +	if (fd < 0) {
>>>> +		perror("connect");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
>>>> +		perror("setsockopt(SO_LINGER)");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>
>>> Since we are testing SO_LINGER, will also be nice to check if it's
>>> working properly, since one of the fixes proposed could break it.
>>>
>>> To test, we may set a small SO_VM_SOCKETS_BUFFER_SIZE on the receive
>>> side and try to send more than that value, obviously without reading
>>> anything into the receiver, and check that close() here, returns after
>>> the timeout we set in .l_linger.
>>
>> I may be doing something wrong, but (at least for loopback transport) ...
>
> Also with VMs is the same, I think virtio_transport_wait_close() can be
> improved to check if everything is sent, avoiding to wait.

What kind of improvement do you have in mind?

I've tried modifying the loop to make close()/shutdown() linger until
unsent_bytes() == 0. No idea if this is acceptable:

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 9e85424c8343..bd8b88d70423 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport,
 			      void (*fn)(struct sock *sk));
 int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
 bool vsock_find_cid(unsigned int cid);
+void vsock_linger(struct sock *sk, long timeout);
 
 /**** TAP ****/
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7e3db87ae433..2cf7571e94da 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1013,6 +1013,25 @@ static int vsock_getname(struct socket *sock,
 	return err;
 }
 
+void vsock_linger(struct sock *sk, long timeout)
+{
+	if (timeout) {
+		DEFINE_WAIT_FUNC(wait, woken_wake_function);
+		ssize_t (*unsent)(struct vsock_sock *vsk);
+		struct vsock_sock *vsk = vsock_sk(sk);
+
+		add_wait_queue(sk_sleep(sk), &wait);
+		unsent = vsk->transport->unsent_bytes;
+
+		do {
+			if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait))
+				break;
+		} while (!signal_pending(current) && timeout);
+
+		remove_wait_queue(sk_sleep(sk), &wait);
+	}
+}
+
 static int vsock_shutdown(struct socket *sock, int mode)
 {
 	int err;
@@ -1056,6 +1075,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
 		if (sock_type_connectible(sk->sk_type)) {
 			sock_reset_flag(sk, SOCK_DONE);
 			vsock_send_shutdown(sk, mode);
+			if (sock_flag(sk, SOCK_LINGER))
+				vsock_linger(sk, sk->sk_lingertime);
 		}
 	}
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7f7de6d88096..9230b8358ef2 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1192,23 +1192,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
 	vsock_remove_sock(vsk);
 }
 
-static void virtio_transport_wait_close(struct sock *sk, long timeout)
-{
-	if (timeout) {
-		DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-		add_wait_queue(sk_sleep(sk), &wait);
-
-		do {
-			if (sk_wait_event(sk, &timeout,
-					  sock_flag(sk, SOCK_DONE), &wait))
-				break;
-		} while (!signal_pending(current) && timeout);
-
-		remove_wait_queue(sk_sleep(sk), &wait);
-	}
-}
-
 static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
 					       bool cancel_timeout)
 {
@@ -1279,7 +1262,7 @@ static bool virtio_transport_close(struct vsock_sock *vsk)
 	(void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
 
 	if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
-		virtio_transport_wait_close(sk, sk->sk_lingertime);
+		vsock_linger(sk, sk->sk_lingertime);
 
 	if (sock_flag(sk, SOCK_DONE)) {
 		return true;

This works, but I find it difficult to test without artificially slowing
the kernel down. It's a race against workers as they quite eagerly do
virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
send() would just block until peer had available space.

Thanks,
Michal
On Fri, Mar 07, 2025 at 10:49:52AM +0100, Michal Luczaj wrote:
>On 2/10/25 11:18, Stefano Garzarella wrote:
>> On Wed, Feb 05, 2025 at 12:20:56PM +0100, Michal Luczaj wrote:
>>> On 2/4/25 11:48, Stefano Garzarella wrote:
>>>> On Tue, Feb 04, 2025 at 01:29:53AM +0100, Michal Luczaj wrote:
>>>>> ...
>>>>> +static void test_stream_linger_client(const struct test_opts *opts)
>>>>> +{
>>>>> +	struct linger optval = {
>>>>> +		.l_onoff = 1,
>>>>> +		.l_linger = 1
>>>>> +	};
>>>>> +	int fd;
>>>>> +
>>>>> +	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
>>>>> +	if (fd < 0) {
>>>>> +		perror("connect");
>>>>> +		exit(EXIT_FAILURE);
>>>>> +	}
>>>>> +
>>>>> +	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
>>>>> +		perror("setsockopt(SO_LINGER)");
>>>>> +		exit(EXIT_FAILURE);
>>>>> +	}
>>>>
>>>> Since we are testing SO_LINGER, will also be nice to check if it's
>>>> working properly, since one of the fixes proposed could break it.
>>>>
>>>> To test, we may set a small SO_VM_SOCKETS_BUFFER_SIZE on the receive
>>>> side and try to send more than that value, obviously without reading
>>>> anything into the receiver, and check that close() here, returns after
>>>> the timeout we set in .l_linger.
>>>
>>> I may be doing something wrong, but (at least for loopback transport) ...
>>
>> Also with VMs is the same, I think virtio_transport_wait_close() can be
>> improved to check if everything is sent, avoiding to wait.
>
>What kind of improvement do you have in mind?
>
>I've tried modifying the loop to make close()/shutdown() linger until
>unsent_bytes() == 0. No idea if this is acceptable:

Yes, that's a good idea, I had something similar in mind, but reusing
unsent_bytes() sounds great to me.

The only problem I see is that in the driver in the guest, the packets
are put in the virtqueue and the variable is decremented only when the
host sends us an interrupt to say that it has copied the packets and
then the guest can free the buffer. Is this okay to consider this as
sending?

I think so, though it's honestly not clear to me if instead by sending
we should consider when the driver copies the bytes into the virtqueue,
but that doesn't mean they were really sent. We should compare it to
what the network devices or AF_UNIX do.

>
>diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>index 9e85424c8343..bd8b88d70423 100644
>--- a/include/net/af_vsock.h
>+++ b/include/net/af_vsock.h
>@@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport,
> 			      void (*fn)(struct sock *sk));
> int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
> bool vsock_find_cid(unsigned int cid);
>+void vsock_linger(struct sock *sk, long timeout);
>
> /**** TAP ****/
>
>diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>index 7e3db87ae433..2cf7571e94da 100644
>--- a/net/vmw_vsock/af_vsock.c
>+++ b/net/vmw_vsock/af_vsock.c
>@@ -1013,6 +1013,25 @@ static int vsock_getname(struct socket *sock,
> 	return err;
> }
>
>+void vsock_linger(struct sock *sk, long timeout)
>+{
>+	if (timeout) {
>+		DEFINE_WAIT_FUNC(wait, woken_wake_function);
>+		ssize_t (*unsent)(struct vsock_sock *vsk);
>+		struct vsock_sock *vsk = vsock_sk(sk);
>+
>+		add_wait_queue(sk_sleep(sk), &wait);
>+		unsent = vsk->transport->unsent_bytes;

This is not implemented by all transports, so we should handle it in
some way (check the pointer or implement in all transports).

>+
>+		do {
>+			if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait))
>+				break;
>+		} while (!signal_pending(current) && timeout);
>+
>+		remove_wait_queue(sk_sleep(sk), &wait);
>+	}
>+}
>+
> static int vsock_shutdown(struct socket *sock, int mode)
> {
> 	int err;
>@@ -1056,6 +1075,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
> 		if (sock_type_connectible(sk->sk_type)) {
> 			sock_reset_flag(sk, SOCK_DONE);
> 			vsock_send_shutdown(sk, mode);
>+			if (sock_flag(sk, SOCK_LINGER))
>+				vsock_linger(sk, sk->sk_lingertime);

mmm, great, so on shutdown we never supported SO_LINGER, right?

> 		}
> 	}
>
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index 7f7de6d88096..9230b8358ef2 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -1192,23 +1192,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
> 	vsock_remove_sock(vsk);
> }
>
>-static void virtio_transport_wait_close(struct sock *sk, long timeout)
>-{
>-	if (timeout) {
>-		DEFINE_WAIT_FUNC(wait, woken_wake_function);
>-
>-		add_wait_queue(sk_sleep(sk), &wait);
>-
>-		do {
>-			if (sk_wait_event(sk, &timeout,
>-					  sock_flag(sk, SOCK_DONE), &wait))
>-				break;
>-		} while (!signal_pending(current) && timeout);
>-
>-		remove_wait_queue(sk_sleep(sk), &wait);
>-	}
>-}
>-
> static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
> 					       bool cancel_timeout)
> {
>@@ -1279,7 +1262,7 @@ static bool virtio_transport_close(struct vsock_sock *vsk)
> 	(void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
>
> 	if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
>-		virtio_transport_wait_close(sk, sk->sk_lingertime);
>+		vsock_linger(sk, sk->sk_lingertime);
>
> 	if (sock_flag(sk, SOCK_DONE)) {
> 		return true;
>
>
>This works, but I find it difficult to test without artificially slowing
>the kernel down. It's a race against workers as they quite eagerly do
>virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
>I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
>send() would just block until peer had available space.

Did you test with loopback or virtio-vsock with a VM?

BTW this approach LGTM!

Thanks,
Stefano
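As an editorial aside, a minimal sketch (not a tested patch) of the pointer check Stefano suggests, applied to the vsock_linger() draft quoted above: transports that don't provide the unsent_bytes() callback simply skip the wait instead of dereferencing a NULL pointer.

```
void vsock_linger(struct sock *sk, long timeout)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	ssize_t (*unsent)(struct vsock_sock *vsk);
	struct vsock_sock *vsk = vsock_sk(sk);

	if (!timeout)
		return;

	/* Not every transport implements unsent_bytes(); don't deref NULL. */
	unsent = vsk->transport->unsent_bytes;
	if (!unsent)
		return;

	add_wait_queue(sk_sleep(sk), &wait);

	do {
		if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait))
			break;
	} while (!signal_pending(current) && timeout);

	remove_wait_queue(sk_sleep(sk), &wait);
}
```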
On 3/10/25 16:24, Stefano Garzarella wrote:
> On Fri, Mar 07, 2025 at 10:49:52AM +0100, Michal Luczaj wrote:
>> ...
>> I've tried modifying the loop to make close()/shutdown() linger until
>> unsent_bytes() == 0. No idea if this is acceptable:
>
> Yes, that's a good idea, I had something similar in mind, but reusing
> unsent_bytes() sounds great to me.
>
> The only problem I see is that in the driver in the guest, the packets
> are put in the virtqueue and the variable is decremented only when the
> host sends us an interrupt to say that it has copied the packets and
> then the guest can free the buffer. Is this okay to consider this as
> sending?
>
> I think so, though it's honestly not clear to me if instead by sending
> we should consider when the driver copies the bytes into the virtqueue,
> but that doesn't mean they were really sent. We should compare it to
> what the network devices or AF_UNIX do.

I had a look at AF_UNIX. SO_LINGER is not supported. Which makes sense;
when you send a packet, it directly lands in receiver's queue. As for
SIOCOUTQ handling: `return sk_wmem_alloc_get(sk)`. So I guess it's more of
an "unread bytes"?

>> +void vsock_linger(struct sock *sk, long timeout)
>> +{
>> +	if (timeout) {
>> +		DEFINE_WAIT_FUNC(wait, woken_wake_function);
>> +		ssize_t (*unsent)(struct vsock_sock *vsk);
>> +		struct vsock_sock *vsk = vsock_sk(sk);
>> +
>> +		add_wait_queue(sk_sleep(sk), &wait);
>> +		unsent = vsk->transport->unsent_bytes;
>
> This is not implemented by all transports, so we should handle it in
> some way (check the pointer or implement in all transports).

Ah, right, I didn't think that through.

>> @@ -1056,6 +1075,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
>> 		if (sock_type_connectible(sk->sk_type)) {
>> 			sock_reset_flag(sk, SOCK_DONE);
>> 			vsock_send_shutdown(sk, mode);
>> +			if (sock_flag(sk, SOCK_LINGER))
>> +				vsock_linger(sk, sk->sk_lingertime);
>
> mmm, great, so on shutdown we never supported SO_LINGER, right?

Yup.

>> ...
>> This works, but I find it difficult to test without artificially slowing
>> the kernel down. It's a race against workers as they quite eagerly do
>> virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
>> I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
>> send() would just block until peer had available space.
>
> Did you test with loopback or virtio-vsock with a VM?

Both, but I may be missing something. Do you see a way to stop (or don't
schedule) the worker from processing queue (and decrementing bytes_unsent)?

Thanks,
Michal
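For reference, the SIOCOUTQ poll from the Python snippet earlier in the thread, expressed in the selftests' C style; on vsock the ioctl is backed by the transport's unsent_bytes() callback. The helper name is made up for illustration, and vsock_test.c already includes <sys/ioctl.h> and <linux/sockios.h>.

```
/* Sketch: busy-wait until the kernel reports no unsent bytes on fd,
 * mirroring the Python unsent() helper above.
 */
static void wait_until_sent(int fd)
{
	int unsent;

	do {
		if (ioctl(fd, SIOCOUTQ, &unsent)) {
			perror("ioctl(SIOCOUTQ)");
			exit(EXIT_FAILURE);
		}
	} while (unsent != 0);
}
```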
On Fri, Mar 14, 2025 at 04:25:16PM +0100, Michal Luczaj wrote:
>On 3/10/25 16:24, Stefano Garzarella wrote:
>> On Fri, Mar 07, 2025 at 10:49:52AM +0100, Michal Luczaj wrote:
>>> ...
>>> I've tried modifying the loop to make close()/shutdown() linger until
>>> unsent_bytes() == 0. No idea if this is acceptable:
>>
>> Yes, that's a good idea, I had something similar in mind, but reusing
>> unsent_bytes() sounds great to me.
>>
>> The only problem I see is that in the driver in the guest, the packets
>> are put in the virtqueue and the variable is decremented only when the
>> host sends us an interrupt to say that it has copied the packets and
>> then the guest can free the buffer. Is this okay to consider this as
>> sending?
>>
>> I think so, though it's honestly not clear to me if instead by sending
>> we should consider when the driver copies the bytes into the virtqueue,
>> but that doesn't mean they were really sent. We should compare it to
>> what the network devices or AF_UNIX do.
>
>I had a look at AF_UNIX. SO_LINGER is not supported. Which makes sense;
>when you send a packet, it directly lands in receiver's queue. As for
>SIOCOUTQ handling: `return sk_wmem_alloc_get(sk)`. So I guess it's more of
>an "unread bytes"?

Yes, I see, actually for AF_UNIX it is simple.
It's hard for us to tell when the user on the other peer actually read
the data, we could use the credit mechanism, but that sometimes isn't
sent unless explicitly requested, so I'd say unsent_bytes() is fine.

>
>>> +void vsock_linger(struct sock *sk, long timeout)
>>> +{
>>> +	if (timeout) {
>>> +		DEFINE_WAIT_FUNC(wait, woken_wake_function);
>>> +		ssize_t (*unsent)(struct vsock_sock *vsk);
>>> +		struct vsock_sock *vsk = vsock_sk(sk);
>>> +
>>> +		add_wait_queue(sk_sleep(sk), &wait);
>>> +		unsent = vsk->transport->unsent_bytes;
>>
>> This is not implemented by all transports, so we should handle it in
>> some way (check the pointer or implement in all transports).
>
>Ah, right, I didn't think that through.
>
>>> @@ -1056,6 +1075,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
>>> 		if (sock_type_connectible(sk->sk_type)) {
>>> 			sock_reset_flag(sk, SOCK_DONE);
>>> 			vsock_send_shutdown(sk, mode);
>>> +			if (sock_flag(sk, SOCK_LINGER))
>>> +				vsock_linger(sk, sk->sk_lingertime);
>>
>> mmm, great, so on shutdown we never supported SO_LINGER, right?
>
>Yup.
>
>>> ...
>>> This works, but I find it difficult to test without artificially slowing
>>> the kernel down. It's a race against workers as they quite eagerly do
>>> virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
>>> I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
>>> send() would just block until peer had available space.
>>
>> Did you test with loopback or virtio-vsock with a VM?
>
>Both, but I may be missing something. Do you see a way to stop (or don't
>schedule) the worker from processing queue (and decrementing bytes_unsent)?

Without touching the driver (which I don't want to do) I can't think of
anything, so I'd say it's okay.

Thanks,
Stefano
On 3/20/25 12:31, Stefano Garzarella wrote:
> On Fri, Mar 14, 2025 at 04:25:16PM +0100, Michal Luczaj wrote:
>> On 3/10/25 16:24, Stefano Garzarella wrote:
>>> On Fri, Mar 07, 2025 at 10:49:52AM +0100, Michal Luczaj wrote:
>>>> ...
>>>> I've tried modifying the loop to make close()/shutdown() linger until
>>>> unsent_bytes() == 0. No idea if this is acceptable:
>>>
>>> Yes, that's a good idea, I had something similar in mind, but reusing
>>> unsent_bytes() sounds great to me.
>>>
>>> The only problem I see is that in the driver in the guest, the packets
>>> are put in the virtqueue and the variable is decremented only when the
>>> host sends us an interrupt to say that it has copied the packets and
>>> then the guest can free the buffer. Is this okay to consider this as
>>> sending?
>>>
>>> I think so, though it's honestly not clear to me if instead by sending
>>> we should consider when the driver copies the bytes into the virtqueue,
>>> but that doesn't mean they were really sent. We should compare it to
>>> what the network devices or AF_UNIX do.
>>
>> I had a look at AF_UNIX. SO_LINGER is not supported. Which makes sense;
>> when you send a packet, it directly lands in receiver's queue. As for
>> SIOCOUTQ handling: `return sk_wmem_alloc_get(sk)`. So I guess it's more of
>> an "unread bytes"?
>
> Yes, I see, actually for AF_UNIX it is simple.
> It's hard for us to tell when the user on the other peer actually read
> the data, we could use the credit mechanism, but that sometimes isn't
> sent unless explicitly requested, so I'd say unsent_bytes() is fine.

One more option: keep the semantics (in a state of not-what-`man 7 socket`-
says) and, for completeness, add the lingering to shutdown()?

>>>> ...
>>>> This works, but I find it difficult to test without artificially slowing
>>>> the kernel down. It's a race against workers as they quite eagerly do
>>>> virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
>>>> I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
>>>> send() would just block until peer had available space.
>>>
>>> Did you test with loopback or virtio-vsock with a VM?
>>
>> Both, but I may be missing something. Do you see a way to stop (or don't
>> schedule) the worker from processing queue (and decrementing bytes_unsent)?
>
> Without touching the driver (which I don't want to do) I can't think of
> anything, so I'd say it's okay.

Turns out there's a way to purge the loopback queue before worker processes
it (I had no success with g2h). If you win that race, bytes_unsent stays
elevated until kingdom come. Then you can close() the socket and watch as
it lingers.

connect(s)
  lock_sock
  while (sk_state != TCP_ESTABLISHED)
    release_sock
    schedule_timeout

// virtio_transport_recv_connecting
// sk_state = TCP_ESTABLISHED

	send(s, 'x')
	  lock_sock
	  virtio_transport_send_pkt_info
	    virtio_transport_get_credit
	(!)   vvs->bytes_unsent += ret
	    vsock_loopback_send_pkt
	      virtio_vsock_skb_queue_tail
	  release_sock
	kill()
  lock_sock
  if signal_pending
    vsock_loopback_cancel_pkt
      virtio_transport_purge_skbs (!)

That said, I may be missing a bigger picture, but is it worth supporting
this "signal disconnects TCP_ESTABLISHED" behaviour in the first place?
Removing it would make the race above (and the whole [1] series) moot.
Plus, it appears to be broken: when I hit this condition and I try to
re-connect to the same listener, I get ETIMEDOUT for loopback and
ECONNRESET for g2h virtio; see [2].

[1]: https://lore.kernel.org/netdev/20250317-vsock-trans-signal-race-v4-0-fc8837f3f1d4@rbox.co/
[2]: Inspired by Luigi's code, which I mauled terribly:

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index d0f6d253ac72..aa4a321ddd9c 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -23,6 +23,7 @@
 #include <sys/ioctl.h>
 #include <linux/sockios.h>
 #include <linux/time64.h>
+#include <pthread.h>
 
 #include "vsock_test_zerocopy.h"
 #include "timeout.h"
@@ -1824,6 +1825,104 @@ static void test_stream_linger_server(const struct test_opts *opts)
 	close(fd);
 }
 
+static void handler(int signum)
+{
+	/* nop */
+}
+
+static void *killer(void *arg)
+{
+	pid_t pid = getpid();
+
+	if ((errno = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL))) {
+		perror("pthread_setcanceltype");
+		exit(EXIT_FAILURE);
+	}
+
+	for (;;) {
+		if (kill(pid, SIGUSR1)) {
+			perror("kill");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	return NULL;
+}
+
+static void client(const struct test_opts *opts)
+{
+	struct sockaddr_vm addr = {
+		.svm_family = AF_VSOCK,
+		.svm_cid = opts->peer_cid,
+		.svm_port = opts->peer_port,
+	};
+	sighandler_t old_handler;
+	bool reconnect = false;
+	pthread_t tid;
+	time_t tout;
+	int c;
+
+	old_handler = signal(SIGUSR1, handler);
+	if (old_handler == SIG_ERR) {
+		perror("signal");
+		exit(EXIT_FAILURE);
+	}
+
+	if ((errno = pthread_create(&tid, NULL, killer, NULL))) {
+		perror("pthread_create");
+		exit(EXIT_FAILURE);
+	}
+
+	tout = current_nsec() + 2 * NSEC_PER_SEC;
+	do {
+		c = socket(AF_VSOCK, SOCK_STREAM, 0);
+		if (c < 0) {
+			perror("socket");
+			exit(EXIT_FAILURE);
+		}
+
+		if (connect(c, (struct sockaddr *)&addr, sizeof(addr)) &&
+		    errno == EINTR) {
+			reconnect = true;
+			break;
+		}
+
+		close(c);
+	} while (current_nsec() < tout);
+
+	if ((errno = pthread_cancel(tid))) {
+		perror("pthread_cancel");
+		exit(EXIT_FAILURE);
+	}
+
+	if ((errno = pthread_join(tid, NULL))) {
+		perror("pthread_join");
+		exit(EXIT_FAILURE);
+	}
+
+	if (signal(SIGUSR1, old_handler) == SIG_ERR) {
+		perror("signal");
+		exit(EXIT_FAILURE);
+	}
+
+	if (reconnect) {
+		if (connect(c, (struct sockaddr *)&addr, sizeof(addr))) {
+			perror("re-connect() after EINTR");
+			exit(EXIT_FAILURE);
+		}
+		close(c);
+	}
+
+	control_writeln("DONE");
+}
+
+static void server(const struct test_opts *opts)
+{
+	int s = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port);
+	control_expectln("DONE");
+	close(s);
+}
+
 static struct test_case test_cases[] = {
 	{
 		.name = "SOCK_STREAM connection reset",
@@ -1984,6 +2083,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_connect_retry_client,
 		.run_server = test_stream_connect_retry_server,
 	},
+	{
+		.name = "SOCK_STREAM connect -> EINTR -> connect",
+		.run_client = client,
+		.run_server = server,
+	},
 	{},
 };
On Tue, Mar 25, 2025 at 02:22:45PM +0100, Michal Luczaj wrote:
>On 3/20/25 12:31, Stefano Garzarella wrote:
>> On Fri, Mar 14, 2025 at 04:25:16PM +0100, Michal Luczaj wrote:
>>> On 3/10/25 16:24, Stefano Garzarella wrote:
>>>> On Fri, Mar 07, 2025 at 10:49:52AM +0100, Michal Luczaj wrote:
>>>>> ...
>>>>> I've tried modifying the loop to make close()/shutdown() linger until
>>>>> unsent_bytes() == 0. No idea if this is acceptable:
>>>>
>>>> Yes, that's a good idea, I had something similar in mind, but reusing
>>>> unsent_bytes() sounds great to me.
>>>>
>>>> The only problem I see is that in the driver in the guest, the packets
>>>> are put in the virtqueue and the variable is decremented only when the
>>>> host sends us an interrupt to say that it has copied the packets and
>>>> then the guest can free the buffer. Is this okay to consider this as
>>>> sending?
>>>>
>>>> I think so, though it's honestly not clear to me if instead by sending
>>>> we should consider when the driver copies the bytes into the virtqueue,
>>>> but that doesn't mean they were really sent. We should compare it to
>>>> what the network devices or AF_UNIX do.
>>>
>>> I had a look at AF_UNIX. SO_LINGER is not supported. Which makes sense;
>>> when you send a packet, it directly lands in receiver's queue. As for
>>> SIOCOUTQ handling: `return sk_wmem_alloc_get(sk)`. So I guess it's more of
>>> an "unread bytes"?
>>
>> Yes, I see, actually for AF_UNIX it is simple.
>> It's hard for us to tell when the user on the other peer actually read
>> the data, we could use the credit mechanism, but that sometimes isn't
>> sent unless explicitly requested, so I'd say unsent_bytes() is fine.
>
>One more option: keep the semantics (in a state of not-what-`man 7 socket`-
>says) and, for completeness, add the lingering to shutdown()?

Sorry, I'm getting lost!
That's because we had a different behavior between close() and
shutdown() right?

If it's the case, I would say let's fix at least that for now.

>
>>>>> ...
>>>>> This works, but I find it difficult to test without artificially slowing
>>>>> the kernel down. It's a race against workers as they quite eagerly do
>>>>> virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
>>>>> I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
>>>>> send() would just block until peer had available space.
>>>>
>>>> Did you test with loopback or virtio-vsock with a VM?
>>>
>>> Both, but I may be missing something. Do you see a way to stop (or don't
>>> schedule) the worker from processing queue (and decrementing bytes_unsent)?
>>
>> Without touching the driver (which I don't want to do) I can't think of
>> anything, so I'd say it's okay.
>
>Turns out there's a way to purge the loopback queue before worker processes
>it (I had no success with g2h). If you win that race, bytes_unsent stays
>elevated until kingdom come. Then you can close() the socket and watch as
>it lingers.
>
>connect(s)
>  lock_sock
>  while (sk_state != TCP_ESTABLISHED)
>    release_sock
>    schedule_timeout
>
>// virtio_transport_recv_connecting
>// sk_state = TCP_ESTABLISHED
>
>	send(s, 'x')
>	  lock_sock
>	  virtio_transport_send_pkt_info
>	    virtio_transport_get_credit
>	(!)   vvs->bytes_unsent += ret
>	    vsock_loopback_send_pkt
>	      virtio_vsock_skb_queue_tail
>	  release_sock
>	kill()
>  lock_sock
>  if signal_pending
>    vsock_loopback_cancel_pkt
>      virtio_transport_purge_skbs (!)
>
>That said, I may be missing a bigger picture, but is it worth supporting
>this "signal disconnects TCP_ESTABLISHED" behaviour in the first place?

Can you elaborate a bit?

>Removing it would make the race above (and the whole [1] series) moot.
>Plus, it appears to be broken: when I hit this condition and I try to
>re-connect to the same listener, I get ETIMEDOUT for loopback and
>ECONNRESET for g2h virtio; see [2].

Could this be related to the fix I sent some days ago?
https://lore.kernel.org/netdev/20250328141528.420719-1-sgarzare@redhat.com/

Thanks,
Stefano
On 4/1/25 12:32, Stefano Garzarella wrote:
> On Tue, Mar 25, 2025 at 02:22:45PM +0100, Michal Luczaj wrote:
>> On 3/20/25 12:31, Stefano Garzarella wrote:
>>> On Fri, Mar 14, 2025 at 04:25:16PM +0100, Michal Luczaj wrote:
>>>> On 3/10/25 16:24, Stefano Garzarella wrote:
>>>>> On Fri, Mar 07, 2025 at 10:49:52AM +0100, Michal Luczaj wrote:
>>>>>> ...
>>>>>> I've tried modifying the loop to make close()/shutdown() linger until
>>>>>> unsent_bytes() == 0. No idea if this is acceptable:
>>>>>
>>>>> Yes, that's a good idea, I had something similar in mind, but reusing
>>>>> unsent_bytes() sounds great to me.
>>>>>
>>>>> The only problem I see is that in the driver in the guest, the packets
>>>>> are put in the virtqueue and the variable is decremented only when the
>>>>> host sends us an interrupt to say that it has copied the packets and
>>>>> then the guest can free the buffer. Is this okay to consider this as
>>>>> sending?
>>>>>
>>>>> I think so, though it's honestly not clear to me if instead by sending
>>>>> we should consider when the driver copies the bytes into the virtqueue,
>>>>> but that doesn't mean they were really sent. We should compare it to
>>>>> what the network devices or AF_UNIX do.
>>>>
>>>> I had a look at AF_UNIX. SO_LINGER is not supported. Which makes sense;
>>>> when you send a packet, it directly lands in receiver's queue. As for
>>>> SIOCOUTQ handling: `return sk_wmem_alloc_get(sk)`. So I guess it's more of
>>>> an "unread bytes"?
>>>
>>> Yes, I see, actually for AF_UNIX it is simple.
>>> It's hard for us to tell when the user on the other peer actually read
>>> the data, we could use the credit mechanism, but that sometimes isn't
>>> sent unless explicitly requested, so I'd say unsent_bytes() is fine.
>>
>> One more option: keep the semantics (in a state of not-what-`man 7 socket`-
>> says) and, for completeness, add the lingering to shutdown()?
>
> Sorry, I'm getting lost!
> That's because we had a different behavior between close() and
> shutdown() right?
>
> If it's the case, I would say let's fix at least that for now.

Yeah, okay, let's keep things simple. I'll post the patch once net-next
opens.

>>>>>> ...
>>>>>> This works, but I find it difficult to test without artificially slowing
>>>>>> the kernel down. It's a race against workers as they quite eagerly do
>>>>>> virtio_transport_consume_skb_sent(), which decrements vvs->bytes_unsent.
>>>>>> I've tried reducing SO_VM_SOCKETS_BUFFER_SIZE as you've suggested, but
>>>>>> send() would just block until peer had available space.
>>>>>
>>>>> Did you test with loopback or virtio-vsock with a VM?
>>>>
>>>> Both, but I may be missing something. Do you see a way to stop (or don't
>>>> schedule) the worker from processing queue (and decrementing bytes_unsent)?
>>>
>>> Without touching the driver (which I don't want to do) I can't think of
>>> anything, so I'd say it's okay.
>>
>> Turns out there's a way to purge the loopback queue before worker processes
>> it (I had no success with g2h). If you win that race, bytes_unsent stays
>> elevated until kingdom come. Then you can close() the socket and watch as
>> it lingers.
>>
>> connect(s)
>>   lock_sock
>>   while (sk_state != TCP_ESTABLISHED)
>>     release_sock
>>     schedule_timeout
>>
>> // virtio_transport_recv_connecting
>> // sk_state = TCP_ESTABLISHED
>>
>> 	send(s, 'x')
>> 	  lock_sock
>> 	  virtio_transport_send_pkt_info
>> 	    virtio_transport_get_credit
>> 	(!)   vvs->bytes_unsent += ret
>> 	    vsock_loopback_send_pkt
>> 	      virtio_vsock_skb_queue_tail
>> 	  release_sock
>> 	kill()
>>   lock_sock
>>   if signal_pending
>>     vsock_loopback_cancel_pkt
>>       virtio_transport_purge_skbs (!)
>>
>> That said, I may be missing a bigger picture, but is it worth supporting
>> this "signal disconnects TCP_ESTABLISHED" behaviour in the first place?
>
> Can you elaborate a bit?

There isn't much to it. I just wondered if connect() -- that has already
established a connection -- could ignore the signal (or pretend it came
too late), to avoid carrying out this kind of disconnect.

>> Removing it would make the race above (and the whole [1] series) moot.
>> Plus, it appears to be broken: when I hit this condition and I try to
>> re-connect to the same listener, I get ETIMEDOUT for loopback and
>> ECONNRESET for g2h virtio; see [2].
>
> Could this be related to the fix I sent some days ago?
> https://lore.kernel.org/netdev/20250328141528.420719-1-sgarzare@redhat.com/

I've tried that. I've also taken a hint from your other mail and attempted
flushing the listener queue, but to no avail. Crude code below. Is there
something wrong with it?

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <pthread.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

static void die(const char *msg)
{
	perror(msg);
	exit(-1);
}

static void barrier(pthread_barrier_t *barr)
{
	errno = pthread_barrier_wait(barr);
	if (errno && errno != PTHREAD_BARRIER_SERIAL_THREAD)
		die("pthread_barrier_wait");
}

static void flush_accept(int s)
{
	int p = accept(s, NULL, NULL);

	if (p < 0) {
		if (errno != EAGAIN)
			perror("accept");
		return;
	}

	printf("accept: drained\n");
	close(p);
}

static void handler(int signum)
{
	/* nop */
}

void static set_accept_timeout(int s)
{
	struct timeval tv = { .tv_sec = 1 };

	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
		die("setsockopt SO_RCVTIMEO");
}

void static set_connect_timeout(int s)
{
	struct timeval tv = { .tv_sec = 1 };

	if (setsockopt(s, AF_VSOCK, SO_VM_SOCKETS_CONNECT_TIMEOUT, &tv, sizeof(tv)))
		die("setsockopt SO_VM_SOCKETS_CONNECT_TIMEOUT");
}

static void *killer(void *arg)
{
	pthread_barrier_t *barr = (pthread_barrier_t *)arg;
	pid_t pid = getpid();

	if ((errno = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL)))
		die("pthread_setcanceltype");

	for (;;) {
		barrier(barr);
		if (kill(pid, SIGUSR1))
			die("kill");
		barrier(barr);
	}

	return NULL;
}

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_LOCAL,
		.svm_port = 1234
	};
	socklen_t alen = sizeof(addr);
	pthread_barrier_t barr;
	pthread_t tid;
	int s, c;

	if ((errno = pthread_barrier_init(&barr, NULL, 2)))
		die("pthread_barrier_init");

	if (signal(SIGUSR1, handler) == SIG_ERR)
		die("signal");

	s = socket(AF_VSOCK, SOCK_STREAM, 0);
	if (s < 0)
		die("socket s");

	set_accept_timeout(s);

	if (bind(s, (struct sockaddr *)&addr, alen))
		die("bind");

	if (listen(s, 64))
		die("listen");

	if ((errno = pthread_create(&tid, NULL, killer, &barr)))
		die("pthread_create");

	for (;;) {
		c = socket(AF_VSOCK, SOCK_STREAM, 0);
		if (c < 0)
			die("socket c");

		barrier(&barr);
		if (connect(c, (struct sockaddr *)&addr, sizeof(addr)) &&
		    errno == EINTR) {
			printf("connect: EINTR\n");
			break;
		}
		barrier(&barr);

		close(c);
		flush_accept(s);
	}

	if ((errno = pthread_cancel(tid)))
		die("pthread_cancel");

	if ((errno = pthread_join(tid, NULL)))
		die("pthread_join");

	flush_accept(s);

	set_connect_timeout(c);
	if (connect(c, (struct sockaddr *)&addr, sizeof(addr)))
		die("re-connect");

	printf("okay?\n");

	return 0;
}
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index dfff8b288265f96b602cb1bfa0e6dce02f114222..d0f6d253ac72d08a957cb81a3c38fcc72bec5a53 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1788,6 +1788,42 @@ static void test_stream_connect_retry_server(const struct test_opts *opts)
 	close(fd);
 }
 
+static void test_stream_linger_client(const struct test_opts *opts)
+{
+	struct linger optval = {
+		.l_onoff = 1,
+		.l_linger = 1
+	};
+	int fd;
+
+	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
+	if (fd < 0) {
+		perror("connect");
+		exit(EXIT_FAILURE);
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) {
+		perror("setsockopt(SO_LINGER)");
+		exit(EXIT_FAILURE);
+	}
+
+	close(fd);
+}
+
+static void test_stream_linger_server(const struct test_opts *opts)
+{
+	int fd;
+
+	fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
+	if (fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	vsock_wait_remote_close(fd);
+	close(fd);
+}
+
 static struct test_case test_cases[] = {
 	{
 		.name = "SOCK_STREAM connection reset",
@@ -1943,6 +1979,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_connect_retry_client,
 		.run_server = test_stream_connect_retry_server,
 	},
+	{
+		.name = "SOCK_STREAM SO_LINGER null-ptr-deref",
+		.run_client = test_stream_linger_client,
+		.run_server = test_stream_linger_server,
+	},
 	{},
 };
Explicitly close() a TCP_ESTABLISHED (connectible) socket with SO_LINGER
enabled. May trigger a null pointer dereference.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 tools/testing/vsock/vsock_test.c | 41 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)