diff mbox series

[v4,07/14] vfio-user: run vfio-user context

Message ID f7fdee9b5cde0f8ee8e99702f113ab22dc4167ea.1639549843.git.jag.raman@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user server in QEMU | expand

Commit Message

Jag Raman Dec. 15, 2021, 3:35 p.m. UTC
Setup a handler to run vfio-user context. The context is driven by
messages to the file descriptor associated with it - get the fd for
the context and hook up the handler with it

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/remote/vfio-user-obj.c | 80 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

Comments

Stefan Hajnoczi Dec. 16, 2021, 11:17 a.m. UTC | #1
On Wed, Dec 15, 2021 at 10:35:31AM -0500, Jagannathan Raman wrote:
> @@ -114,6 +118,62 @@ static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
>      vfu_object_init_ctx(o, errp);
>  }
>  
> +static void vfu_object_ctx_run(void *opaque)
> +{
> +    VfuObject *o = opaque;
> +    int ret = -1;
> +
> +    while (ret != 0) {
> +        ret = vfu_run_ctx(o->vfu_ctx);
> +        if (ret < 0) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == ENOTCONN) {
> +                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> +                o->vfu_poll_fd = -1;
> +                object_unparent(OBJECT(o));
> +                break;

If nothing else logs a message then I think that should be done here so
users know why their vfio-user server object disappeared.

> +            } else {
> +                error_setg(&error_abort, "vfu: Failed to run device %s - %s",
> +                           o->device, strerror(errno));

error_abort is equivalent to assuming !o->daemon. In the case where the
user doesn't want to automatically shut down the process we need to log
a message without aborting.

> +                 break;

Indentation is off.

> +            }
> +        }
> +    }
> +}
> +
> +static void vfu_object_attach_ctx(void *opaque)
> +{
> +    VfuObject *o = opaque;
> +    GPollFD pfds[1];
> +    int ret;
> +
> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> +
> +    pfds[0].fd = o->vfu_poll_fd;
> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> +
> +retry_attach:
> +    ret = vfu_attach_ctx(o->vfu_ctx);
> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
> +        goto retry_attach;

This can block the thread indefinitely. Other events like monitor
commands are not handled in this loop. Please make this asynchronous
(set an fd handler and return from this function so we can try again
later).

The vfu_attach_ctx() implementation synchronously negotiates the
vfio-user connection :(. That's a shame because even if accept(2) is
handled asynchronously, the negotiation can still block. It would be
cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
avoid blocking. Is that possible?

If libvfio-user can't make vfu_attach_ctx() fully async then it may be
possible to spawn a thread just for the blocking vfu_attach_ctx() call
and report the result back to the event loop (e.g. using EventNotifier).
That adds a bunch of code to work around a blocking API though, so I
guess we can leave the blocking part if necessary.

At the very minimum, please make EAGAIN/EWOULDBLOCK async as mentioned
above and add a comment explaining the situation with the
partially-async vfu_attach_ctx() API so it's clear that this can block
(that way it's clear that you're aware of the issue and this isn't by
accident).

> +    } else if (ret < 0) {
> +        error_setg(&error_abort,
> +                   "vfu: Failed to attach device %s to context - %s",
> +                   o->device, strerror(errno));

error_abort assumes !o->daemon. Please handle the o->daemon == true
case by logging an error without aborting.

> +        return;
> +    }
> +
> +    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
> +    if (o->vfu_poll_fd < 0) {
> +        error_setg(&error_abort, "vfu: Failed to get poll fd %s", o->device);

Same here.

> @@ -208,6 +284,8 @@ static void vfu_object_init(Object *obj)
>                     TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
>          return;
>      }
> +
> +    o->vfu_poll_fd = -1;
>  }

This must call qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL)
when o->vfu_poll_fd != -1 to avoid leaving a dangling fd handler
callback registered.
Jag Raman Dec. 17, 2021, 5:59 p.m. UTC | #2
> On Dec 16, 2021, at 6:17 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Wed, Dec 15, 2021 at 10:35:31AM -0500, Jagannathan Raman wrote:
>> @@ -114,6 +118,62 @@ static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
>>     vfu_object_init_ctx(o, errp);
>> }
>> 
>> +static void vfu_object_ctx_run(void *opaque)
>> +{
>> +    VfuObject *o = opaque;
>> +    int ret = -1;
>> +
>> +    while (ret != 0) {
>> +        ret = vfu_run_ctx(o->vfu_ctx);
>> +        if (ret < 0) {
>> +            if (errno == EINTR) {
>> +                continue;
>> +            } else if (errno == ENOTCONN) {
>> +                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
>> +                o->vfu_poll_fd = -1;
>> +                object_unparent(OBJECT(o));
>> +                break;
> 
> If nothing else logs a message then I think that should be done here so
> users know why their vfio-user server object disappeared.

Sure will do.

Do you prefer a trace, or a message to the console? Trace makes sense to me.
Presently, the client could unplug the vfio-user device which would trigger the
deletion of this object. This process could happen quietly.

> 
>> +            } else {
>> +                error_setg(&error_abort, "vfu: Failed to run device %s - %s",
>> +                           o->device, strerror(errno));
> 
> error_abort is equivalent to assuming !o->daemon. In the case where the
> user doesn't want to automatically shut down the process we need to log
> a message without aborting.

OK, makes sense.

> 
>> +                 break;
> 
> Indentation is off.
> 
>> +            }
>> +        }
>> +    }
>> +}
>> +
>> +static void vfu_object_attach_ctx(void *opaque)
>> +{
>> +    VfuObject *o = opaque;
>> +    GPollFD pfds[1];
>> +    int ret;
>> +
>> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
>> +
>> +    pfds[0].fd = o->vfu_poll_fd;
>> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
>> +
>> +retry_attach:
>> +    ret = vfu_attach_ctx(o->vfu_ctx);
>> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
>> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
>> +        goto retry_attach;
> 
> This can block the thread indefinitely. Other events like monitor
> commands are not handled in this loop. Please make this asynchronous
> (set an fd handler and return from this function so we can try again
> later).
> 
> The vfu_attach_ctx() implementation synchronously negotiates the
> vfio-user connection :(. That's a shame because even if accept(2) is
> handled asynchronously, the negotiation can still block. It would be
> cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
> avoid blocking. Is that possible?

Thanos / John,

    Any thoughts on this?

> 
> If libvfio-user can't make vfu_attach_ctx() fully async then it may be
> possible to spawn a thread just for the blocking vfu_attach_ctx() call
> and report the result back to the event loop (e.g. using EventNotifier).
> That adds a bunch of code to work around a blocking API though, so I
> guess we can leave the blocking part if necessary.
> 
> At the very minimum, please make EAGAIN/EWOULDBLOCK async as mentioned
> above and add a comment explaining the situation with the
> partially-async vfu_attach_ctx() API so it's clear that this can block
> (that way it's clear that you're aware of the issue and this isn't by
> accident).

Sure, we could make the attach async at QEMU depending on how the
library prefers to do this.

> 
>> +    } else if (ret < 0) {
>> +        error_setg(&error_abort,
>> +                   "vfu: Failed to attach device %s to context - %s",
>> +                   o->device, strerror(errno));
> 
> error_abort assumes !o->daemon. Please handle the o->daemon == true
> case by logging an error without aborting.
> 
>> +        return;
>> +    }
>> +
>> +    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
>> +    if (o->vfu_poll_fd < 0) {
>> +        error_setg(&error_abort, "vfu: Failed to get poll fd %s", o->device);
> 
> Same here.
> 
>> @@ -208,6 +284,8 @@ static void vfu_object_init(Object *obj)
>>                    TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
>>         return;
>>     }
>> +
>> +    o->vfu_poll_fd = -1;
>> }
> 
> This must call qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL)
> when o->vfu_poll_fd != -1 to avoid leaving a dangling fd handler
> callback registered.

This is during the init phase, and the FD handlers are not set. Do you mean
to add this at finalize?

I agree it’s good to explicitly add this at finalize. But vfu_destroy_ctx() should
trigger an ENOTCONN, which would do it anyway.

Thank you!
--
Jag
Stefan Hajnoczi Dec. 20, 2021, 8:29 a.m. UTC | #3
On Fri, Dec 17, 2021 at 05:59:48PM +0000, Jag Raman wrote:
> 
> 
> > On Dec 16, 2021, at 6:17 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Wed, Dec 15, 2021 at 10:35:31AM -0500, Jagannathan Raman wrote:
> >> @@ -114,6 +118,62 @@ static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
> >>     vfu_object_init_ctx(o, errp);
> >> }
> >> 
> >> +static void vfu_object_ctx_run(void *opaque)
> >> +{
> >> +    VfuObject *o = opaque;
> >> +    int ret = -1;
> >> +
> >> +    while (ret != 0) {
> >> +        ret = vfu_run_ctx(o->vfu_ctx);
> >> +        if (ret < 0) {
> >> +            if (errno == EINTR) {
> >> +                continue;
> >> +            } else if (errno == ENOTCONN) {
> >> +                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> >> +                o->vfu_poll_fd = -1;
> >> +                object_unparent(OBJECT(o));
> >> +                break;
> > 
> > If nothing else logs a message then I think that should be done here so
> > users know why their vfio-user server object disappeared.
> 
> Sure will do.
> 
> Do you prefer a trace, or a message to the console? Trace makes sense to me.
> Presently, the client could unplug the vfio-user device which would trigger the
> deletion of this object. This process could happen quietly.

If there is no way to differentiate graceful disconnect from unexpected
disconnect then logging might be too noisy.

Regarding the automatic deletion of the object, that might not be
desirable for two reasons:
1. It prevents reconnection or another client connecting.
2. Management tools are in the dark about it.

For #2 there are monitor events that QEMU emits to notify management
tools about state changes like disconnections.

It's worth thinking about current and future use cases before baking in
a policy like automatically deleting VfuObject on disconnect because
it's inflexible and would require a QEMU update in the future to support
a different policy.

One approach is to emit a disconnect event but leave the VfuObject in a
disconnected state. The management tool can then restart or clean up,
depending on its policy.

I'm not sure what's best because it depends on the use cases, but maybe
you and others have ideas here.

> >> @@ -208,6 +284,8 @@ static void vfu_object_init(Object *obj)
> >>                    TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
> >>         return;
> >>     }
> >> +
> >> +    o->vfu_poll_fd = -1;
> >> }
> > 
> > This must call qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL)
> > when o->vfu_poll_fd != -1 to avoid leaving a dangling fd handler
> > callback registered.
> 
> This is during the init phase, and the FD handlers are not set. Do you mean
> to add this at finalize?
> 
> I agree it’s good to explicitly add this at finalize. But vfu_destroy_ctx() should
> trigger a ENOTCONN, which would do it anyway.

I'm not sure my comment makes sense since this is the init function, not
finalize.

However, it's not clear to me that the o->vfu_poll_fd fd handler is
unregistered from the event loop when VfuObject is finalized (e.g. by
the object-del monitor command). You say vfu_destroy_ctx() triggers
ENOTCONN, but the VfuObject is freed after finalize returns so when is
the fd handler deregistered?

Stefan
Jag Raman Dec. 21, 2021, 3:04 a.m. UTC | #4
> On Dec 20, 2021, at 3:29 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Fri, Dec 17, 2021 at 05:59:48PM +0000, Jag Raman wrote:
>> 
>> 
>>> On Dec 16, 2021, at 6:17 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>> On Wed, Dec 15, 2021 at 10:35:31AM -0500, Jagannathan Raman wrote:
>>>> @@ -114,6 +118,62 @@ static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
>>>>    vfu_object_init_ctx(o, errp);
>>>> }
>>>> 
>>>> +static void vfu_object_ctx_run(void *opaque)
>>>> +{
>>>> +    VfuObject *o = opaque;
>>>> +    int ret = -1;
>>>> +
>>>> +    while (ret != 0) {
>>>> +        ret = vfu_run_ctx(o->vfu_ctx);
>>>> +        if (ret < 0) {
>>>> +            if (errno == EINTR) {
>>>> +                continue;
>>>> +            } else if (errno == ENOTCONN) {
>>>> +                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
>>>> +                o->vfu_poll_fd = -1;
>>>> +                object_unparent(OBJECT(o));
>>>> +                break;
>>> 
>>> If nothing else logs a message then I think that should be done here so
>>> users know why their vfio-user server object disappeared.
>> 
>> Sure will do.
>> 
>> Do you prefer a trace, or a message to the console? Trace makes sense to me.
>> Presently, the client could unplug the vfio-user device which would trigger the
>> deletion of this object. This process could happen quietly.
> 
> If there is no way to differentiate graceful disconnect from unexpected
> disconnect then logging might be too noisy.

I think that’s what happens in the regular VFIO case also.
vfio_put_base_device() closes the FD used for ioctls.

> 
> Regarding the automatic deletion of the object, that might not be
> desirable for two reasons:
> 1. It prevents reconnection or another client connecting.
> 2. Management tools are in the dark about it.
> 
> For #2 there are monitor events that QEMU emits to notify management
> tools about state changes like disconnections.

This is very interesting. I suppose you’re referring to something like
‘BLOCK_JOB_COMPLETED’ event.

It’d be good to inform the management tools about disconnection. Not
used this before, will check it out to gather ideas on how to use it.

> 
> It's worth thinking about current and future use cases before baking in
> a policy like automatically deleting VfuObject on disconnect because
> it's inflexible and would require a QEMU update in the future to support
> a different policy.
> 
> One approach is to emit a disconnect event but leave the VfuObject in a
> disconnected state. The management tool can then restart or clean up,
> depending on its policy.
> 
> I'm not sure what's best because it depends on the use cases, but maybe
> you and others have ideas here.
> 
>>>> @@ -208,6 +284,8 @@ static void vfu_object_init(Object *obj)
>>>>                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
>>>>        return;
>>>>    }
>>>> +
>>>> +    o->vfu_poll_fd = -1;
>>>> }
>>> 
>>> This must call qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL)
>>> when o->vfu_poll_fd != -1 to avoid leaving a dangling fd handler
>>> callback registered.
>> 
>> This is during the init phase, and the FD handlers are not set. Do you mean
>> to add this at finalize?
>> 
>> I agree it’s good to explicitly add this at finalize. But vfu_destroy_ctx() should
>> trigger a ENOTCONN, which would do it anyway.
> 
> I'm not sure my comment makes sense since this is the init function, not
> finalize.
> 
> However, it's not clear to me that the o->vfu_poll_fd fd handler is
> unregistered from the event loop when VfuObject is finalized (e.g. by
> the object-del monitor command). You say vfu_destroy_ctx() triggers
> ENOTCONN, but the VfuObject is freed after finalize returns so when is
> the fd handler deregistered?

That is correct - will remove the FD handler in finalize also.

Thank you!
--
Jag

> 
> Stefan
Thanos Makatos Jan. 5, 2022, 10:38 a.m. UTC | #5
> -----Original Message-----
> From: Jag Raman <jag.raman@oracle.com>
> Sent: 17 December 2021 18:00
> To: Stefan Hajnoczi <stefanha@redhat.com>; John Levon
> <john.levon@nutanix.com>; Thanos Makatos <thanos.makatos@nutanix.com>
> Cc: qemu-devel <qemu-devel@nongnu.org>; Alex Williamson
> <alex.williamson@redhat.com>; Marc-André Lureau
> <marcandre.lureau@gmail.com>; Philippe Mathieu-Daudé
> <philmd@redhat.com>; pbonzini@redhat.com; alex.bennee@linaro.org;
> thuth@redhat.com; crosa@redhat.com; wainersm@redhat.com;
> bleal@redhat.com; Elena Ufimtseva <elena.ufimtseva@oracle.com>; John
> Levon <john.levon@nutanix.com>; John Johnson
> <john.g.johnson@oracle.com>; Thanos Makatos
> <thanos.makatos@nutanix.com>; Swapnil Ingle <swapnil.ingle@nutanix.com>
> Subject: Re: [PATCH v4 07/14] vfio-user: run vfio-user context
> 
> 
> 
> > On Dec 16, 2021, at 6:17 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > On Wed, Dec 15, 2021 at 10:35:31AM -0500, Jagannathan Raman wrote:
> >> @@ -114,6 +118,62 @@ static void vfu_object_set_device(Object *obj,
> const char *str, Error **errp)
> >>     vfu_object_init_ctx(o, errp);
> >> }
> >>
> >> +static void vfu_object_ctx_run(void *opaque)
> >> +{
> >> +    VfuObject *o = opaque;
> >> +    int ret = -1;
> >> +
> >> +    while (ret != 0) {
> >> +        ret = vfu_run_ctx(o->vfu_ctx);
> >> +        if (ret < 0) {
> >> +            if (errno == EINTR) {
> >> +                continue;
> >> +            } else if (errno == ENOTCONN) {
> >> +                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> >> +                o->vfu_poll_fd = -1;
> >> +                object_unparent(OBJECT(o));
> >> +                break;
> >
> > If nothing else logs a message then I think that should be done here so
> > users know why their vfio-user server object disappeared.
> 
> Sure will do.
> 
> Do you prefer a trace, or a message to the console? Trace makes sense to me.
> Presently, the client could unplug the vfio-user device which would trigger the
> deletion of this object. This process could happen quietly.
> 
> >
> >> +            } else {
> >> +                error_setg(&error_abort, "vfu: Failed to run device %s - %s",
> >> +                           o->device, strerror(errno));
> >
> > error_abort is equivalent to assuming !o->daemon. In the case where the
> > user doesn't want to automatically shut down the process we need to log
> > a message without aborting.
> 
> OK, makes sense.
> 
> >
> >> +                 break;
> >
> > Indentation is off.
> >
> >> +            }
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +static void vfu_object_attach_ctx(void *opaque)
> >> +{
> >> +    VfuObject *o = opaque;
> >> +    GPollFD pfds[1];
> >> +    int ret;
> >> +
> >> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> >> +
> >> +    pfds[0].fd = o->vfu_poll_fd;
> >> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> >> +
> >> +retry_attach:
> >> +    ret = vfu_attach_ctx(o->vfu_ctx);
> >> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
> >> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
> >> +        goto retry_attach;
> >
> > This can block the thread indefinitely. Other events like monitor
> > commands are not handled in this loop. Please make this asynchronous
> > (set an fd handler and return from this function so we can try again
> > later).
> >
> > The vfu_attach_ctx() implementation synchronously negotiates the
> > vfio-user connection :(. That's a shame because even if accept(2) is
> > handled asynchronously, the negotiation can still block. It would be
> > cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
> > avoid blocking. Is that possible?
> 
> Thanos / John,
> 
>     Any thoughts on this?

I'm discussing this with John and FYI there are other places where libvfio-user can block, e.g. sending a response or receiving a command. Is it just the negotiation you want to be asynchronous, or _all_ libvfio-user operations? Making libvfio-user fully asynchronous might require a substantial API rewrite.

> 
> >
> > If libvfio-user can't make vfu_attach_ctx() fully async then it may be
> > possible to spawn a thread just for the blocking vfu_attach_ctx() call
> > and report the result back to the event loop (e.g. using EventNotifier).
> > That adds a bunch of code to work around a blocking API though, so I
> > guess we can leave the blocking part if necessary.
> >
> > At the very minimum, please make EAGAIN/EWOULDBLOCK async as
> mentioned
> > above and add a comment explaining the situation with the
> > partially-async vfu_attach_ctx() API so it's clear that this can block
> > (that way it's clear that you're aware of the issue and this isn't by
> > accident).
> 
> Sure, we could make the attach async at QEMU depending on how the
> library prefers to do this.
> 
> >
> >> +    } else if (ret < 0) {
> >> +        error_setg(&error_abort,
> >> +                   "vfu: Failed to attach device %s to context - %s",
> >> +                   o->device, strerror(errno));
> >
> > error_abort assumes !o->daemon. Please handle the o->daemon == true
> > case by logging an error without aborting.
> >
> >> +        return;
> >> +    }
> >> +
> >> +    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
> >> +    if (o->vfu_poll_fd < 0) {
> >> +        error_setg(&error_abort, "vfu: Failed to get poll fd %s", o->device);
> >
> > Same here.
> >
> >> @@ -208,6 +284,8 @@ static void vfu_object_init(Object *obj)
> >>                    TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
> >>         return;
> >>     }
> >> +
> >> +    o->vfu_poll_fd = -1;
> >> }
> >
> > This must call qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL)
> > when o->vfu_poll_fd != -1 to avoid leaving a dangling fd handler
> > callback registered.
> 
> This is during the init phase, and the FD handlers are not set. Do you mean
> to add this at finalize?
> 
> I agree it’s good to explicitly add this at finalize. But vfu_destroy_ctx() should
> trigger a ENOTCONN, which would do it anyway.
> 
> Thank you!
> --
> Jag
Stefan Hajnoczi Jan. 6, 2022, 1:35 p.m. UTC | #6
On Wed, Jan 05, 2022 at 10:38:10AM +0000, Thanos Makatos wrote:
> 
> 
> > -----Original Message-----
> > From: Jag Raman <jag.raman@oracle.com>
> > Sent: 17 December 2021 18:00
> > To: Stefan Hajnoczi <stefanha@redhat.com>; John Levon
> > <john.levon@nutanix.com>; Thanos Makatos <thanos.makatos@nutanix.com>
> > Cc: qemu-devel <qemu-devel@nongnu.org>; Alex Williamson
> > <alex.williamson@redhat.com>; Marc-André Lureau
> > <marcandre.lureau@gmail.com>; Philippe Mathieu-Daudé
> > <philmd@redhat.com>; pbonzini@redhat.com; alex.bennee@linaro.org;
> > thuth@redhat.com; crosa@redhat.com; wainersm@redhat.com;
> > bleal@redhat.com; Elena Ufimtseva <elena.ufimtseva@oracle.com>; John
> > Levon <john.levon@nutanix.com>; John Johnson
> > <john.g.johnson@oracle.com>; Thanos Makatos
> > <thanos.makatos@nutanix.com>; Swapnil Ingle <swapnil.ingle@nutanix.com>
> > Subject: Re: [PATCH v4 07/14] vfio-user: run vfio-user context
> > 
> > 
> > 
> > > On Dec 16, 2021, at 6:17 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >
> > > On Wed, Dec 15, 2021 at 10:35:31AM -0500, Jagannathan Raman wrote:
> > >> @@ -114,6 +118,62 @@ static void vfu_object_set_device(Object *obj,
> > const char *str, Error **errp)
> > >>     vfu_object_init_ctx(o, errp);
> > >> }
> > >>
> > >> +static void vfu_object_ctx_run(void *opaque)
> > >> +{
> > >> +    VfuObject *o = opaque;
> > >> +    int ret = -1;
> > >> +
> > >> +    while (ret != 0) {
> > >> +        ret = vfu_run_ctx(o->vfu_ctx);
> > >> +        if (ret < 0) {
> > >> +            if (errno == EINTR) {
> > >> +                continue;
> > >> +            } else if (errno == ENOTCONN) {
> > >> +                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> > >> +                o->vfu_poll_fd = -1;
> > >> +                object_unparent(OBJECT(o));
> > >> +                break;
> > >
> > > If nothing else logs a message then I think that should be done here so
> > > users know why their vfio-user server object disappeared.
> > 
> > Sure will do.
> > 
> > Do you prefer a trace, or a message to the console? Trace makes sense to me.
> > Presently, the client could unplug the vfio-user device which would trigger the
> > deletion of this object. This process could happen quietly.
> > 
> > >
> > >> +            } else {
> > >> +                error_setg(&error_abort, "vfu: Failed to run device %s - %s",
> > >> +                           o->device, strerror(errno));
> > >
> > > error_abort is equivalent to assuming !o->daemon. In the case where the
> > > user doesn't want to automatically shut down the process we need to log
> > > a message without aborting.
> > 
> > OK, makes sense.
> > 
> > >
> > >> +                 break;
> > >
> > > Indentation is off.
> > >
> > >> +            }
> > >> +        }
> > >> +    }
> > >> +}
> > >> +
> > >> +static void vfu_object_attach_ctx(void *opaque)
> > >> +{
> > >> +    VfuObject *o = opaque;
> > >> +    GPollFD pfds[1];
> > >> +    int ret;
> > >> +
> > >> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> > >> +
> > >> +    pfds[0].fd = o->vfu_poll_fd;
> > >> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> > >> +
> > >> +retry_attach:
> > >> +    ret = vfu_attach_ctx(o->vfu_ctx);
> > >> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
> > >> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
> > >> +        goto retry_attach;
> > >
> > > This can block the thread indefinitely. Other events like monitor
> > > commands are not handled in this loop. Please make this asynchronous
> > > (set an fd handler and return from this function so we can try again
> > > later).
> > >
> > > The vfu_attach_ctx() implementation synchronously negotiates the
> > > vfio-user connection :(. That's a shame because even if accept(2) is
> > > handled asynchronously, the negotiation can still block. It would be
> > > cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
> > > avoid blocking. Is that possible?
> > 
> > Thanos / John,
> > 
> >     Any thoughts on this?
> 
> I'm discussing this with John and FYI there are other places where libvfio-user can block, e.g. sending a response or receiving a command. Is it just the negotiation you want it to be asynchronous or _all_ libvfio-user operations? Making libvfio-user fully asynchronous might require a substantial API rewrite.

I see at least two reasons for a fully async API:

1. The program wants to handle other events (e.g. a management REST API)
   from the same event loop thread that invokes libvfio-user. If
   libvfio-user blocks then the other events cannot be handled within a
   reasonable time frame.

   The workaround for this is to use multi-threading and ignore the
   event-driven architecture implied by vfu_get_poll_fd().

2. The program handles multiple clients that do not trust each other.
   This could be a software-defined network switch or storage appliance.
   A malicious client can cause a denial-of-service by making a
   libvfio-user call block.

   Again, the program needs separate threads instead of an event loop to
   work around this.

The downside to a sync approach is that programs that already have an
event loop require extra code to set up dedicated threads for
libvfio-user. That's a library integration/usability issue.

In some cases it's okay to block: when the program doesn't need to
handle other events. If most users of libvfio-user are expected to fall
into this category then there's no need to change the API.

Either way, the doc comments in libvfio-user.h aren't very clear.
Someone integrating this library may think vfu_get_poll_fd() allows for
fully async operation.

Stefan
John Levon Jan. 10, 2022, 5:56 p.m. UTC | #7
On Thu, Jan 06, 2022 at 01:35:32PM +0000, Stefan Hajnoczi wrote:

> > > >> +static void vfu_object_attach_ctx(void *opaque)
> > > >> +{
> > > >> +    VfuObject *o = opaque;
> > > >> +    GPollFD pfds[1];
> > > >> +    int ret;
> > > >> +
> > > >> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> > > >> +
> > > >> +    pfds[0].fd = o->vfu_poll_fd;
> > > >> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> > > >> +
> > > >> +retry_attach:
> > > >> +    ret = vfu_attach_ctx(o->vfu_ctx);
> > > >> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
> > > >> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
> > > >> +        goto retry_attach;
> > > >
> > > > This can block the thread indefinitely. Other events like monitor
> > > > commands are not handled in this loop. Please make this asynchronous
> > > > (set an fd handler and return from this function so we can try again
> > > > later).
> > > >
> > > > The vfu_attach_ctx() implementation synchronously negotiates the
> > > > vfio-user connection :(. That's a shame because even if accept(2) is
> > > > handled asynchronously, the negotiation can still block. It would be
> > > > cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
> > > > avoid blocking. Is that possible?
> > > 
> > > Thanos / John,
> > > 
> > >     Any thoughts on this?
> > 
> > I'm discussing this with John and FYI there are other places where libvfio-user can block, e.g. sending a response or receiving a command. Is it just the negotiation you want it to be asynchronous or _all_ libvfio-user operations? Making libvfio-user fully asynchronous might require a substantial API rewrite.
> 
> I see at least two reasons for a fully async API:
> 
> 1. The program wants to handle other events (e.g. a management REST API)
>    from the same event loop thread that invokes libvfio-user. If
>    libvfio-user blocks then the other events cannot be handled within a
>    reasonable time frame.
> 
>    The workaround for this is to use multi-threading and ignore the
>    event-driven architecture implied by vfu_get_poll_fd().
> 
> 2. The program handles multiple clients that do not trust each other.
>    This could be a software-defined network switch or storage appliance.
>    A malicious client can cause a denial-of-service by making a
>    libvfio-user call block.
> 
>    Again, the program needs separate threads instead of an event loop to
>    work around this.

Hi Stefan, we're certainly aware of the rationale. Ultimately we just haven't
yet found resources to fix this: in practice, we don't really hit problems from
the parts that are still synchronous. Of course, that's not a good argument when
it comes to your second issue, and indeed the library is not currently suitable
for multiple untrusted clients.

For Jag: patches are welcome. But it's not just negotiate(): all sorts of
things are potentially synchronous - for example, it's expected that the region
read/write callbacks are done synchronously. Any other client reply, or
server->client message, is also synchronous.

It's partly why we haven't yet stabilised the API.

regards
john
Stefan Hajnoczi Jan. 11, 2022, 9:36 a.m. UTC | #8
On Mon, Jan 10, 2022 at 05:56:25PM +0000, John Levon wrote:
> On Thu, Jan 06, 2022 at 01:35:32PM +0000, Stefan Hajnoczi wrote:
> 
> > > > >> +static void vfu_object_attach_ctx(void *opaque)
> > > > >> +{
> > > > >> +    VfuObject *o = opaque;
> > > > >> +    GPollFD pfds[1];
> > > > >> +    int ret;
> > > > >> +
> > > > >> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
> > > > >> +
> > > > >> +    pfds[0].fd = o->vfu_poll_fd;
> > > > >> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
> > > > >> +
> > > > >> +retry_attach:
> > > > >> +    ret = vfu_attach_ctx(o->vfu_ctx);
> > > > >> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
> > > > >> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
> > > > >> +        goto retry_attach;
> > > > >
> > > > > This can block the thread indefinitely. Other events like monitor
> > > > > commands are not handled in this loop. Please make this asynchronous
> > > > > (set an fd handler and return from this function so we can try again
> > > > > later).
> > > > >
> > > > > The vfu_attach_ctx() implementation synchronously negotiates the
> > > > > vfio-user connection :(. That's a shame because even if accept(2) is
> > > > > handled asynchronously, the negotiation can still block. It would be
> > > > > cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
> > > > > avoid blocking. Is that possible?
> > > > 
> > > > Thanos / John,
> > > > 
> > > >     Any thoughts on this?
> > > 
> > > I'm discussing this with John and FYI there are other places where libvfio-user can block, e.g. sending a response or receiving a command. Is it just the negotiation you want it to be asynchronous or _all_ libvfio-user operations? Making libvfio-user fully asynchronous might require a substantial API rewrite.
> > 
> > I see at least two reasons for a fully async API:
> > 
> > 1. The program wants to handle other events (e.g. a management REST API)
> >    from the same event loop thread that invokes libvfio-user. If
> >    libvfio-user blocks then the other events cannot be handled within a
> >    reasonable time frame.
> > 
> >    The workaround for this is to use multi-threading and ignore the
> >    event-driven architecture implied by vfu_get_poll_fd().
> > 
> > 2. The program handles multiple clients that do not trust each other.
> >    This could be a software-defined network switch or storage appliance.
> >    A malicious client can cause a denial-of-service by making a
> >    libvfio-user call block.
> > 
> >    Again, the program needs separate threads instead of an event loop to
> >    work around this.
> 
> Hi Stefan, we're certainly aware of the rationale. Ultimately we just haven't
> yet found resources to fix this: in practice, we don't really hit problems from
> the parts that are still synchronous. Of course, that's not a good argument when
> it comes to your second issue, and indeed the library is not currently suitable
> for multiple untrusted clients.
> 
> For Jag btw: patches are welcome. But it's not just negotiate(): all sorts of
> things are potentially synchronous - for example, it's expected that the region
> read/write callbacks are done synchronously. Any other client reply, or
> server->client message, is also synchronous.
> 
> It's partly why we haven't yet stabilised the API.

From the QEMU side it's okay to have blocking code in this experimental
feature if users are aware of the limitation (e.g. the monitor is
unresponsive if vfio-user blocks) and multiple untrusted clients are not
supported. The QEMU code should also make its limitations clear so
readers are aware of them and know the author chose this approach
intentionally rather than due to an oversight.

Jag: Please mention this in the documentation and add a comment to
vfu_object_attach_ctx() explaining that this code currently blocks.

I think in the long run it will be necessary to address it, e.g. in the
multi-client case. That can either be done with threads or by making
libvfio-user fully async.

Stefan
Jag Raman Jan. 11, 2022, 1:12 p.m. UTC | #9
> On Jan 11, 2022, at 4:36 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Mon, Jan 10, 2022 at 05:56:25PM +0000, John Levon wrote:
>> On Thu, Jan 06, 2022 at 01:35:32PM +0000, Stefan Hajnoczi wrote:
>> 
>>>>>>> +static void vfu_object_attach_ctx(void *opaque)
>>>>>>> +{
>>>>>>> +    VfuObject *o = opaque;
>>>>>>> +    GPollFD pfds[1];
>>>>>>> +    int ret;
>>>>>>> +
>>>>>>> +    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
>>>>>>> +
>>>>>>> +    pfds[0].fd = o->vfu_poll_fd;
>>>>>>> +    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
>>>>>>> +
>>>>>>> +retry_attach:
>>>>>>> +    ret = vfu_attach_ctx(o->vfu_ctx);
>>>>>>> +    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
>>>>>>> +        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
>>>>>>> +        goto retry_attach;
>>>>>> 
>>>>>> This can block the thread indefinitely. Other events like monitor
>>>>>> commands are not handled in this loop. Please make this asynchronous
>>>>>> (set an fd handler and return from this function so we can try again
>>>>>> later).
>>>>>> 
>>>>>> The vfu_attach_ctx() implementation synchronously negotiates the
>>>>>> vfio-user connection :(. That's a shame because even if accept(2) is
>>>>>> handled asynchronously, the negotiation can still block. It would be
>>>>>> cleanest to have a fully async libvfio-user's vfu_attach_ctx() API to
>>>>>> avoid blocking. Is that possible?
>>>>> 
>>>>> Thanos / John,
>>>>> 
>>>>>    Any thoughts on this?
>>>> 
>>>> I'm discussing this with John and FYI there are other places where libvfio-user can block, e.g. sending a response or receiving a command. Is it just the negotiation you want it to be asynchronous or _all_ libvfio-user operations? Making libvfio-user fully asynchronous might require a substantial API rewrite.
>>> 
>>> I see at least two reasons for a fully async API:
>>> 
>>> 1. The program wants to handle other events (e.g. a management REST API)
>>>   from the same event loop thread that invokes libvfio-user. If
>>>   libvfio-user blocks then the other events cannot be handled within a
>>>   reasonable time frame.
>>> 
>>>   The workaround for this is to use multi-threading and ignore the
>>>   event-driven architecture implied by vfu_get_poll_fd().
>>> 
>>> 2. The program handles multiple clients that do not trust each other.
>>>   This could be a software-defined network switch or storage appliance.
>>>   A malicious client can cause a denial-of-service by making a
>>>   libvfio-user call block.
>>> 
>>>   Again, the program needs separate threads instead of an event loop to
>>>   work around this.
>> 
>> Hi Stefan, we're certainly aware of the rationale. Ultimately we just haven't
>> yet found resources to fix this: in practice, we don't really hit problems from
>> the parts that are still synchronous. Of course, that's not a good argument when
>> it comes to your second issue, and indeed the library is not currently suitable
>> for multiple untrusted clients.
>> 
>> For Jag btw: patches are welcome. But it's not just negotiate(): all sorts of
>> things are potentially synchronous - for example, it's expected that the region
>> read/write callbacks are done synchronously. Any other client reply, or
>> server->client message, is also synchronous.
>> 
>> It's partly why we haven't yet stabilised the API.
> 
> From the QEMU side it's okay to have blocking code in this experimental
> feature if users are aware of the limitation (e.g. the monitor is
> unresponsive if vfio-user blocks) and multiple untrusted clients are not
> supported. The QEMU code should also make its limitations clear so
> readers are aware of them and know the author chose this approach
> intentionally rather than due to an oversight.
> 
> Jag: Please mention this in the documentation and add a comment to
> vfu_object_attach_ctx() explaining that this code currently blocks.

Sure, will do.

Thank you!
--
Jag

> 
> I think in the long run it will be necessary to address it, e.g. in the
> multi-client case. That can either be done with threads or by making
> libvfio-user fully async.
> 
> Stefan
diff mbox series

Patch

diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index bcbea59bf1..a01a0ad185 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -42,10 +42,12 @@ 
 #include "qapi/error.h"
 #include "qapi/qapi-visit-sockets.h"
 #include "qemu/notify.h"
+#include "qemu/thread.h"
 #include "sysemu/sysemu.h"
 #include "libvfio-user.h"
 #include "hw/qdev-core.h"
 #include "hw/pci/pci.h"
+#include "qemu/timer.h"
 
 #define TYPE_VFU_OBJECT "x-vfio-user-server"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -73,6 +75,8 @@  struct VfuObject {
     vfu_ctx_t *vfu_ctx;
 
     PCIDevice *pci_dev;
+
+    int vfu_poll_fd;
 };
 
 static void vfu_object_init_ctx(VfuObject *o, Error **errp);
@@ -114,6 +118,62 @@  static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
     vfu_object_init_ctx(o, errp);
 }
 
+static void vfu_object_ctx_run(void *opaque)
+{
+    VfuObject *o = opaque;
+    int ret = -1;
+
+    while (ret != 0) {
+        ret = vfu_run_ctx(o->vfu_ctx);
+        if (ret < 0) {
+            if (errno == EINTR) {
+                continue;
+            } else if (errno == ENOTCONN) {
+                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+                o->vfu_poll_fd = -1;
+                object_unparent(OBJECT(o));
+                break;
+            } else {
+                error_setg(&error_abort, "vfu: Failed to run device %s - %s",
+                           o->device, strerror(errno));
+                 break;
+            }
+        }
+    }
+}
+
+static void vfu_object_attach_ctx(void *opaque)
+{
+    VfuObject *o = opaque;
+    GPollFD pfds[1];
+    int ret;
+
+    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+
+    pfds[0].fd = o->vfu_poll_fd;
+    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
+
+retry_attach:
+    ret = vfu_attach_ctx(o->vfu_ctx);
+    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
+        qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
+        goto retry_attach;
+    } else if (ret < 0) {
+        error_setg(&error_abort,
+                   "vfu: Failed to attach device %s to context - %s",
+                   o->device, strerror(errno));
+        return;
+    }
+
+    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+    if (o->vfu_poll_fd < 0) {
+        error_setg(&error_abort, "vfu: Failed to get poll fd %s", o->device);
+        return;
+    }
+
+    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
+}
+
 /*
  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
  * properties. It also depends on devices instantiated in QEMU. These
@@ -151,7 +211,8 @@  static void vfu_object_init_ctx(VfuObject *o, Error **errp)
         return;
     }
 
-    o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path, 0,
+    o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
+                                LIBVFIO_USER_FLAG_ATTACH_NB,
                                 o, VFU_DEV_TYPE_PCI);
     if (o->vfu_ctx == NULL) {
         error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
@@ -183,6 +244,21 @@  static void vfu_object_init_ctx(VfuObject *o, Error **errp)
         goto fail;
     }
 
+    ret = vfu_realize_ctx(o->vfu_ctx);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to realize device %s- %s",
+                   o->device, strerror(errno));
+        goto fail;
+    }
+
+    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+    if (o->vfu_poll_fd < 0) {
+        error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
+        goto fail;
+    }
+
+    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
+
     return;
 
 fail:
@@ -208,6 +284,8 @@  static void vfu_object_init(Object *obj)
                    TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
         return;
     }
+
+    o->vfu_poll_fd = -1;
 }
 
 static void vfu_object_finalize(Object *obj)