Message ID | 93b7566e5d565b9e5d8127849bb5be65057e25cc.1596217462.git.jag.raman@oracle.com (mailing list archive)
---|---
State | New, archived
Series | Initial support for multi-process qemu
On Fri, Jul 31, 2020 at 02:20:24PM -0400, Jagannathan Raman wrote:
> @@ -343,3 +349,49 @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
>          }
>      }
>  }
> +
> +static void hb_msg(PCIProxyDev *dev)
> +{
> +    DeviceState *ds = DEVICE(dev);
> +    Error *local_err = NULL;
> +    MPQemuMsg msg = { 0 };
> +
> +    msg.cmd = PROXY_PING;
> +    msg.bytestream = 0;
> +    msg.size = 0;
> +
> +    (void)mpqemu_msg_send_and_await_reply(&msg, dev->ioc, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        qio_channel_close(dev->ioc, &local_err);
> +        error_setg(&error_fatal, "Lost contact with device %s", ds->id);
> +    }
> +}

Here is my feedback from the last revision. Was this addressed?

This patch seems incomplete since no action is taken when the device
fails to respond. vCPU threads that access the device will still get
stuck.

The simplest way to make this useful is to close the connection when a
timeout occurs. Then the G_IO_HUP handler for the UNIX domain socket
should perform connection cleanup. At that point there are a few
choices:

1. Stop guest execution and wait for the host admin to restore the
   mplink so execution can resume. This is similar to how -drive
   rerror=stop pauses the guest when a disk I/O error is encountered.

2. Stop guest execution but defer it until this stale device is
   actually accessed. This maximizes guest uptime. Guests that rarely
   access the device may not notice at all.

3. Return 0 from MemoryRegion read operations and ignore writes. The
   guest continues executing but the device is broken. This is risky
   because device drivers inside the guest may not be ready to deal
   with this. The result could be data loss or corruption.

4. Raise a bus-level event. Maybe PCI error reporting can be used to
   offline the device.

5. Terminate the guest with an error message.

6. ?

Until the heartbeat is fully implemented and tested I suggest dropping
it from this patch series. Remember the G_IO_HUP will happen anyway if
the remote device process terminates.
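For illustration, here is a minimal sketch of the timeout-then-close
approach suggested above. It assumes hypothetical hb_reply_pending and
hb_deadline_timer fields on PCIProxyDev and an arbitrary HB_DEADLINE_MS
value; none of these is in the posted series. It reuses the
mpqemu_msg_send() helper from the patch below so the ping never blocks,
and closes the channel so the existing G_IO_HUP handler can run the
connection cleanup:

/* Hypothetical sketch, not part of the series: send the ping without
 * waiting for a reply, and arm a deadline timer.  If the reply has not
 * arrived when the deadline fires, close the channel so the G_IO_HUP
 * handler performs cleanup. */
#include "qemu/osdep.h"
#include "qemu/timer.h"
#include "qemu/error-report.h"
#include "io/channel.h"
#include "io/mpqemu-link.h"
#include "hw/pci/proxy.h"

#define HB_DEADLINE_MS 5000   /* example value */

static void hb_deadline_cb(void *opaque)
{
    PCIProxyDev *dev = opaque;

    /* hb_reply_pending is a hypothetical field, set when a ping is
     * sent and cleared by the reply handler (not shown). */
    if (dev->hb_reply_pending) {
        /* No PROXY_PING reply in time: drop the mplink connection. */
        qio_channel_close(dev->ioc, NULL);
    }
}

static void hb_send_ping(PCIProxyDev *dev)
{
    Error *local_err = NULL;
    MPQemuMsg msg = { .cmd = PROXY_PING };

    dev->hb_reply_pending = true;
    mpqemu_msg_send(&msg, dev->ioc, &local_err);   /* fire and forget */
    if (local_err) {
        error_report_err(local_err);
    }

    /* hb_deadline_timer would be created with timer_new_ms() and
     * hb_deadline_cb as its callback. */
    timer_mod(dev->hb_deadline_timer,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + HB_DEADLINE_MS);
}

Using QEMU_CLOCK_REALTIME here, rather than the QEMU_CLOCK_VIRTUAL used
in the patch below, would keep the deadline ticking even while the
guest is paused; which clock is appropriate is itself a design choice.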
On Tue, Aug 11, 2020 at 03:41:30PM +0100, Stefan Hajnoczi wrote:
> On Fri, Jul 31, 2020 at 02:20:24PM -0400, Jagannathan Raman wrote:
> > @@ -343,3 +349,49 @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
> >          }
> >      }
> >  }
> > +
> > +static void hb_msg(PCIProxyDev *dev)
> > +{
> > +    DeviceState *ds = DEVICE(dev);
> > +    Error *local_err = NULL;
> > +    MPQemuMsg msg = { 0 };
> > +
> > +    msg.cmd = PROXY_PING;
> > +    msg.bytestream = 0;
> > +    msg.size = 0;
> > +
> > +    (void)mpqemu_msg_send_and_await_reply(&msg, dev->ioc, &local_err);
> > +    if (local_err) {
> > +        error_report_err(local_err);
> > +        qio_channel_close(dev->ioc, &local_err);
> > +        error_setg(&error_fatal, "Lost contact with device %s", ds->id);
> > +    }
> > +}
>
> Here is my feedback from the last revision. Was this addressed?
>

Hi Stefan,

Thank you for reviewing the patchset. In this version we decided to
shut down the guest when the heartbeat did not get a reply from the
remote, by setting error_fatal.
Should we approach it differently, or would you prefer us to get rid
of the heartbeat in this form?

Thank you,
Elena

> This patch seems incomplete since no action is taken when the device
> fails to respond. vCPU threads that access the device will still get
> stuck.
>
> The simplest way to make this useful is to close the connection when
> a timeout occurs. Then the G_IO_HUP handler for the UNIX domain
> socket should perform connection cleanup. At that point there are a
> few choices:
>
> 1. Stop guest execution and wait for the host admin to restore the
>    mplink so execution can resume. This is similar to how -drive
>    rerror=stop pauses the guest when a disk I/O error is encountered.
>
> 2. Stop guest execution but defer it until this stale device is
>    actually accessed. This maximizes guest uptime. Guests that rarely
>    access the device may not notice at all.
>
> 3. Return 0 from MemoryRegion read operations and ignore writes. The
>    guest continues executing but the device is broken. This is risky
>    because device drivers inside the guest may not be ready to deal
>    with this. The result could be data loss or corruption.
>
> 4. Raise a bus-level event. Maybe PCI error reporting can be used to
>    offline the device.
>
> 5. Terminate the guest with an error message.
>
> 6. ?
>
> Until the heartbeat is fully implemented and tested I suggest
> dropping it from this patch series. Remember the G_IO_HUP will happen
> anyway if the remote device process terminates.
On Fri, Aug 14, 2020 at 04:01:47PM -0700, Elena Ufimtseva wrote:
> On Tue, Aug 11, 2020 at 03:41:30PM +0100, Stefan Hajnoczi wrote:
> > On Fri, Jul 31, 2020 at 02:20:24PM -0400, Jagannathan Raman wrote:
> > > @@ -343,3 +349,49 @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
> > >          }
> > >      }
> > >  }
> > > +
> > > +static void hb_msg(PCIProxyDev *dev)
> > > +{
> > > +    DeviceState *ds = DEVICE(dev);
> > > +    Error *local_err = NULL;
> > > +    MPQemuMsg msg = { 0 };
> > > +
> > > +    msg.cmd = PROXY_PING;
> > > +    msg.bytestream = 0;
> > > +    msg.size = 0;
> > > +
> > > +    (void)mpqemu_msg_send_and_await_reply(&msg, dev->ioc, &local_err);
> > > +    if (local_err) {
> > > +        error_report_err(local_err);
> > > +        qio_channel_close(dev->ioc, &local_err);
> > > +        error_setg(&error_fatal, "Lost contact with device %s", ds->id);
> > > +    }
> > > +}
> >
> > Here is my feedback from the last revision. Was this addressed?
> >
>
> Hi Stefan,
>
> Thank you for reviewing the patchset. In this version we decided to
> shut down the guest when the heartbeat did not get a reply from the
> remote, by setting error_fatal.
> Should we approach it differently, or would you prefer us to get rid
> of the heartbeat in this form?

I think the only case that this patch handles is when the mpqemu
channel is closed. The VM hangs when the channel is still open but the
remote is unresponsive. (mpqemu_msg_send_and_await_reply() calls
aio_poll() with the global mutex held, so vCPUs cannot make progress.)

The heartbeat mechanism needs to handle the case where the other side
isn't responding. It can't hang QEMU.

I suggest dropping this patch. It can be done later.

Stefan
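For illustration, a minimal sketch of option 1 from Stefan's list
(pause the guest the way -drive rerror=stop does), which the G_IO_HUP
cleanup path could invoke once the mplink is gone. The callback name
proxy_mplink_hup() is hypothetical and not part of the series; the
runstate calls are the ones QEMU uses for block-device error stops:

/* Hypothetical sketch, not part of the series: pause the guest when
 * the mplink drops, so the admin can restore it and resume with the
 * "cont" monitor command. */
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "sysemu/runstate.h"
#include "hw/pci/proxy.h"

static void proxy_mplink_hup(PCIProxyDev *dev)
{
    DeviceState *ds = DEVICE(dev);

    error_report("mplink to device %s lost, pausing guest", ds->id);

    /* Ask the main loop to stop the VM; vCPUs halt at a safe point
     * instead of blocking inside a device access. */
    qemu_system_vmstop_request_prepare();
    qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
}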
diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 756b710..2a4d7f1 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -26,6 +26,7 @@ static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
                                 MPQemuMsg *msg);
 static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
 static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_proxy_ping_msg(QIOChannel *ioc, Error **errp);
 
 gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
                             gpointer opaque)
@@ -75,6 +76,9 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
     case SET_IRQFD:
         process_set_irqfd_msg(pci_dev, &msg);
         break;
+    case PROXY_PING:
+        process_proxy_ping_msg(ioc, &local_err);
+        break;
     default:
         error_setg(&local_err,
                    "Unknown command (%d) received for device %s (pid=%d)",
@@ -223,3 +227,18 @@ fail:
                    "in remote process pid=%d", getpid());
     }
 }
+
+static void process_proxy_ping_msg(QIOChannel *ioc, Error **errp)
+{
+    MPQemuMsg ret = { 0 };
+    Error *local_err = NULL;
+
+    ret.cmd = RET_MSG;
+    ret.size = sizeof(ret.data1);
+
+    mpqemu_msg_send(&ret, ioc, &local_err);
+    if (local_err) {
+        error_setg(errp, "Error while sending message to proxy "
+                         "in remote process pid=%d", getpid());
+    }
+}
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 50a806c..490093c 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -24,6 +24,8 @@
 #include "util/event_notifier-posix.c"
 
 static void probe_pci_info(PCIDevice *dev, Error **errp);
+static void start_hb_timer(PCIProxyDev *dev);
+static void stop_hb_timer(PCIProxyDev *dev);
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
@@ -111,6 +113,8 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
     setup_irqfd(dev);
 
     probe_pci_info(PCI_DEVICE(dev), errp);
+
+    start_hb_timer(dev);
 }
 
 static void pci_proxy_dev_exit(PCIDevice *pdev)
@@ -123,6 +127,8 @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
 
     event_notifier_cleanup(&dev->intr);
     event_notifier_cleanup(&dev->resample);
+
+    stop_hb_timer(dev);
 }
 
 static int config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
@@ -343,3 +349,49 @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
         }
     }
 }
+
+static void hb_msg(PCIProxyDev *dev)
+{
+    DeviceState *ds = DEVICE(dev);
+    Error *local_err = NULL;
+    MPQemuMsg msg = { 0 };
+
+    msg.cmd = PROXY_PING;
+    msg.bytestream = 0;
+    msg.size = 0;
+
+    (void)mpqemu_msg_send_and_await_reply(&msg, dev->ioc, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        qio_channel_close(dev->ioc, &local_err);
+        error_setg(&error_fatal, "Lost contact with device %s", ds->id);
+    }
+}
+
+#define NOP_INTERVAL 1000
+
+static void remote_ping(void *opaque)
+{
+    PCIProxyDev *dev = opaque;
+
+    hb_msg(dev);
+
+    timer_mod(dev->hb_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void start_hb_timer(PCIProxyDev *dev)
+{
+    dev->hb_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                 remote_ping,
+                                 dev);
+
+    timer_mod(dev->hb_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + NOP_INTERVAL);
+}
+
+static void stop_hb_timer(PCIProxyDev *dev)
+{
+    timer_del(dev->hb_timer);
+    timer_free(dev->hb_timer);
+}
diff --git a/include/hw/pci/proxy.h b/include/hw/pci/proxy.h
index 15cc381..d784328 100644
--- a/include/hw/pci/proxy.h
+++ b/include/hw/pci/proxy.h
@@ -40,6 +40,8 @@ struct PCIProxyDev {
     EventNotifier intr;
     EventNotifier resample;
 
+    QEMUTimer *hb_timer;
+
     ProxyMemoryRegion region[PCI_NUM_REGIONS];
 };
 
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index a3844e1..5f2913f 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -40,6 +40,7 @@ typedef enum {
     BAR_READ,
     SET_IRQFD,
     GET_PCI_INFO,
+    PROXY_PING,
     MAX = INT_MAX,
 } MPQemuCmd;
 
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 6fa4665..0222e81 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -280,6 +280,14 @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
             return false;
         }
         break;
+    case PROXY_PING:
+        if (msg->bytestream || msg->num_fds) {
+            return false;
+        }
+        if (msg->size) {
+            return false;
+        }
+        break;
     default:
         break;
     }
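For completeness, option 3 from the review (reads return 0, writes are
ignored) could in principle be modeled by pointing the proxy BARs at
stub MemoryRegionOps once the link is gone. The following is an
illustrative sketch only; dead_link_ops and both callbacks are
hypothetical names, and the review itself warns that guest drivers may
not cope with a device behaving this way:

/* Hypothetical sketch, not part of the series: MMIO stubs for a dead
 * mplink.  Reads return 0 and writes are dropped, so the guest keeps
 * running against a broken device. */
#include "qemu/osdep.h"
#include "exec/memory.h"

static uint64_t dead_link_read(void *opaque, hwaddr addr, unsigned size)
{
    return 0;   /* device unreachable: reads return zero */
}

static void dead_link_write(void *opaque, hwaddr addr, uint64_t val,
                            unsigned size)
{
    /* device unreachable: writes are silently dropped */
}

static const MemoryRegionOps dead_link_ops = {
    .read = dead_link_read,
    .write = dead_link_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};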