diff mbox series

pciehp: fast unplug for virtual machines

Message ID 20211111090225.946381-1-kraxel@redhat.com (mailing list archive)
State Not Applicable
Delegated to: Bjorn Helgaas
Headers show
Series pciehp: fast unplug for virtual machines | expand

Commit Message

Gerd Hoffmann Nov. 11, 2021, 9:02 a.m. UTC
The PCIe specification asks the OS to wait five seconds after the
attention button has been pressed before actually un-plugging the
device.  This gives the operator the chance to cancel the operation
by pressing the attention button again within those five seconds.

For physical hardware this makes sense.  Picking the wrong button
by accident can easily happen and it can be corrected that way.

For virtual hardware the benefits are questionable.  Typically
users find the five second delay annoying.

This patch adds the fast_virtual_unplug module parameter to the
pciehp driver.  When enabled (which is the default) the linux
kernel will simply skip the delay for virtual pcie ports, which
reduces the total time for the unplug operation from 6-7 seconds
to 1-2 seconds.

Virtual pcie ports are identified by PCI ID.  For now the qemu
pcie root ports are detected, other hardware can be added easily.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/pci/hotplug/pciehp.h      |  3 +++
 drivers/pci/hotplug/pciehp_core.c |  5 +++++
 drivers/pci/hotplug/pciehp_ctrl.c | 11 ++++++++++-
 3 files changed, 18 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin Nov. 11, 2021, 5 p.m. UTC | #1
On Thu, Nov 11, 2021 at 10:02:24AM +0100, Gerd Hoffmann wrote:
> The PCIe specification asks the OS to wait five seconds after the
> attention button has been pressed before actually un-plugging the
> device.  This gives the operator the chance to cancel the operation
> by pressing the attention button again within those five seconds.
> 
> For physical hardware this makes sense.  Picking the wrong button
> by accident can easily happen and it can be corrected that way.
> 
> For virtual hardware the benefits are questionable.  Typically
> users find the five second delay annoying.
> 
> This patch adds the fast_virtual_unplug module parameter to the
> pciehp driver.  When enabled (which is the default) the linux
> kernel will simply skip the delay for virtual pcie ports, which
> reduces the total time for the unplug operation from 6-7 seconds
> to 1-2 seconds.

BTW how come it's still taking seconds, not milliseconds?

> Virtual pcie ports are identified by PCI ID.  For now the qemu
> pcie root ports are detected, other hardware can be added easily.
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  drivers/pci/hotplug/pciehp.h      |  3 +++
>  drivers/pci/hotplug/pciehp_core.c |  5 +++++
>  drivers/pci/hotplug/pciehp_ctrl.c | 11 ++++++++++-
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
> index 69fd401691be..131ffec2e947 100644
> --- a/drivers/pci/hotplug/pciehp.h
> +++ b/drivers/pci/hotplug/pciehp.h
> @@ -79,6 +79,7 @@ extern int pciehp_poll_time;
>   * @request_result: result of last user request submitted to the IRQ thread
>   * @requester: wait queue to wake up on completion of user request,
>   *	used for synchronous slot enable/disable request via sysfs
> + * @is_virtual: virtual machine pcie port.
>   *
>   * PCIe hotplug has a 1:1 relationship between controller and slot, hence
>   * unlike other drivers, the two aren't represented by separate structures.
> @@ -109,6 +110,8 @@ struct controller {
>  	unsigned int ist_running;
>  	int request_result;
>  	wait_queue_head_t requester;
> +
> +	bool is_virtual;
>  };
>  
>  /**
> diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
> index ad3393930ecb..28867ec33f5b 100644
> --- a/drivers/pci/hotplug/pciehp_core.c
> +++ b/drivers/pci/hotplug/pciehp_core.c
> @@ -227,6 +227,11 @@ static int pciehp_probe(struct pcie_device *dev)
>  		goto err_out_shutdown_notification;
>  	}
>  
> +	if (dev->port->vendor == PCI_VENDOR_ID_REDHAT &&
> +	    dev->port->device == 0x000c)
> +		/* qemu pcie root port */
> +		ctrl->is_virtual = true;
> +
>  	pciehp_check_presence(ctrl);
>  
>  	return 0;
> diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
> index 529c34808440..119bb11f87d6 100644
> --- a/drivers/pci/hotplug/pciehp_ctrl.c
> +++ b/drivers/pci/hotplug/pciehp_ctrl.c
> @@ -15,12 +15,17 @@
>  
>  #define dev_fmt(fmt) "pciehp: " fmt
>  
> +#include <linux/moduleparam.h>
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/pm_runtime.h>
>  #include <linux/pci.h>
>  #include "pciehp.h"
>  
> +static bool fast_virtual_unplug = true;
> +module_param(fast_virtual_unplug, bool, 0644);
> +MODULE_PARM_DESC(fast_virtual_unplug, "Fast unplug (don't wait 5 seconds) for virtual machines.");
> +
>  /* The following routines constitute the bulk of the
>     hotplug controller logic
>   */
> @@ -176,7 +181,11 @@ void pciehp_handle_button_press(struct controller *ctrl)
>  		/* blink power indicator and turn off attention */
>  		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
>  				      PCI_EXP_SLTCTL_ATTN_IND_OFF);
> -		schedule_delayed_work(&ctrl->button_work, 5 * HZ);
> +		if (ctrl->is_virtual && fast_virtual_unplug) {
> +			schedule_delayed_work(&ctrl->button_work, 1);
> +		} else {
> +			schedule_delayed_work(&ctrl->button_work, 5 * HZ);
> +		}
>  		break;
>  	case BLINKINGOFF_STATE:
>  	case BLINKINGON_STATE:
> -- 
> 2.33.1
Bjorn Helgaas Nov. 11, 2021, 9:50 p.m. UTC | #2
[+cc Lukas]

If you update or repost this, please note and follow previous history
of subject lines, e.g., "git log --oneline drivers/pci/hotplug/pciehp*".

On Thu, Nov 11, 2021 at 10:02:24AM +0100, Gerd Hoffmann wrote:
> The PCIe specification asks the OS to wait five seconds after the
> attention button has been pressed before actually un-plugging the
> device.  This gives the operator the chance to cancel the operation
> by pressing the attention button again within those five seconds.

Would be nice to include the specific reference here (spec name,
revision, and section).

> For physical hardware this makes sense.  Picking the wrong button
> by accident can easily happen and it can be corrected that way.
> 
> For virtual hardware the benefits are questionable.  Typically
> users find the five second delay annoying.
> 
> This patch adds the fast_virtual_unplug module parameter to the
> pciehp driver.  When enabled (which is the default) the linux
> kernel will simply skip the delay for virtual pcie ports, which
> reduces the total time for the unplug operation from 6-7 seconds
> to 1-2 seconds.

Include an example of use, e.g., "pci=fast_virtual_unplug" or
whatever.  Also probably update
Documentation/admin-guide/kernel-parameters.txt.

"Virtual" doesn't seem like quite the right descriptor here.  That's
one use case, but I think the parameter should describe the actual
*effect*, not the use case, e.g., something related to the delay after
the attention button.

If it's practical, I think it would be nicer to have a sysfs attribute
instead of a kernel boot parameter.  Then we wouldn't have to reboot
to change this, and it could be generalized to allow arbitrary delays,
i.e., no delay, 5 seconds, or whatever the admin decides.

I really hate the portdrv sysfs structure (pcie004, etc.) though, so I
would avoid putting it there if possible.  I don't know if other
hotplug drivers have similar delays.  If they do, would be nice to at
least have the potential for the mechanism to work for them all, even
if we don't implement it for all of them.

> Virtual pcie ports are identified by PCI ID.  For now the qemu
> pcie root ports are detected, other hardware can be added easily.

s/pcie/PCIe/ in English text and comments (above and below).

> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  drivers/pci/hotplug/pciehp.h      |  3 +++
>  drivers/pci/hotplug/pciehp_core.c |  5 +++++
>  drivers/pci/hotplug/pciehp_ctrl.c | 11 ++++++++++-
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
> index 69fd401691be..131ffec2e947 100644
> --- a/drivers/pci/hotplug/pciehp.h
> +++ b/drivers/pci/hotplug/pciehp.h
> @@ -79,6 +79,7 @@ extern int pciehp_poll_time;
>   * @request_result: result of last user request submitted to the IRQ thread
>   * @requester: wait queue to wake up on completion of user request,
>   *	used for synchronous slot enable/disable request via sysfs
> + * @is_virtual: virtual machine pcie port.
>   *
>   * PCIe hotplug has a 1:1 relationship between controller and slot, hence
>   * unlike other drivers, the two aren't represented by separate structures.
> @@ -109,6 +110,8 @@ struct controller {
>  	unsigned int ist_running;
>  	int request_result;
>  	wait_queue_head_t requester;
> +
> +	bool is_virtual;

Most (but sadly not all) previous similar flags are "unsigned int :1".
None are bool.

>  };
>  
>  /**
> diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
> index ad3393930ecb..28867ec33f5b 100644
> --- a/drivers/pci/hotplug/pciehp_core.c
> +++ b/drivers/pci/hotplug/pciehp_core.c
> @@ -227,6 +227,11 @@ static int pciehp_probe(struct pcie_device *dev)
>  		goto err_out_shutdown_notification;
>  	}
>  
> +	if (dev->port->vendor == PCI_VENDOR_ID_REDHAT &&
> +	    dev->port->device == 0x000c)
> +		/* qemu pcie root port */
> +		ctrl->is_virtual = true;
> +
>  	pciehp_check_presence(ctrl);
>  
>  	return 0;
> diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
> index 529c34808440..119bb11f87d6 100644
> --- a/drivers/pci/hotplug/pciehp_ctrl.c
> +++ b/drivers/pci/hotplug/pciehp_ctrl.c
> @@ -15,12 +15,17 @@
>  
>  #define dev_fmt(fmt) "pciehp: " fmt
>  
> +#include <linux/moduleparam.h>
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/pm_runtime.h>
>  #include <linux/pci.h>
>  #include "pciehp.h"
>  
> +static bool fast_virtual_unplug = true;
> +module_param(fast_virtual_unplug, bool, 0644);
> +MODULE_PARM_DESC(fast_virtual_unplug, "Fast unplug (don't wait 5 seconds) for virtual machines.");
> +
>  /* The following routines constitute the bulk of the
>     hotplug controller logic
>   */
> @@ -176,7 +181,11 @@ void pciehp_handle_button_press(struct controller *ctrl)
>  		/* blink power indicator and turn off attention */
>  		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
>  				      PCI_EXP_SLTCTL_ATTN_IND_OFF);
> -		schedule_delayed_work(&ctrl->button_work, 5 * HZ);
> +		if (ctrl->is_virtual && fast_virtual_unplug) {
> +			schedule_delayed_work(&ctrl->button_work, 1);

Why schedule_delayed_work() instead of schedule_work()?  And if
schedule_delayed_work(), why a delay of 1 instead of 0?

> +		} else {
> +			schedule_delayed_work(&ctrl->button_work, 5 * HZ);
> +		}
>  		break;
>  	case BLINKINGOFF_STATE:
>  	case BLINKINGON_STATE:
> -- 
> 2.33.1
>
Krzysztof WilczyƄski Nov. 12, 2021, midnight UTC | #3
Hi Gerd,

> The PCIe specification asks the OS to wait five seconds after the
> attention button has been pressed before actually un-plugging the
> device.  This gives the operator the chance to cancel the operation
> by pressing the attention button again within those five seconds.
> 
> For physical hardware this makes sense.  Picking the wrong button
> by accident can easily happen and it can be corrected that way.
> 
> For virtual hardware the benefits are questionable.  Typically
> users find the five second delay annoying.
> 
> This patch adds the fast_virtual_unplug module parameter to the
> pciehp driver.  When enabled (which is the default) the linux
> kernel will simply skip the delay for virtual pcie ports, which
> reduces the total time for the unplug operation from 6-7 seconds
> to 1-2 seconds.
> 
> Virtual pcie ports are identified by PCI ID.  For now the qemu
> pcie root ports are detected, other hardware can be added easily.

Bjorn asked to correct the PCIe references, whereas I would ask you to
change "linux" to "Linux", "qemu" to "QEMU", and generally capitalise
things where and as needed throughout.

>   * @request_result: result of last user request submitted to the IRQ thread
>   * @requester: wait queue to wake up on completion of user request,
>   *	used for synchronous slot enable/disable request via sysfs
> + * @is_virtual: virtual machine pcie port.

It would be "PCIe" here too for consistency.

> +	if (dev->port->vendor == PCI_VENDOR_ID_REDHAT &&
> +	    dev->port->device == 0x000c)
> +		/* qemu pcie root port */
> +		ctrl->is_virtual = true;

I would personally move the comment above the if-statement as it looks
a little awkward added like that.  You could also add a little bit more
detail why this is set for QEMU and what it's needed, etc.  Also, if you
don't mind, let's change it to "QEMU" in the comment for consistency.

> -		schedule_delayed_work(&ctrl->button_work, 5 * HZ);
> +		if (ctrl->is_virtual && fast_virtual_unplug) {
> +			schedule_delayed_work(&ctrl->button_work, 1);
> +		} else {
> +			schedule_delayed_work(&ctrl->button_work, 5 * HZ);
> +		}

The brackets are fine but technically not needed above as per the kernel
coding style.

	Krzysztof
Gerd Hoffmann Nov. 12, 2021, 9:56 a.m. UTC | #4
Hi,

> > This patch adds the fast_virtual_unplug module parameter to the
> > pciehp driver.  When enabled (which is the default) the linux
> > kernel will simply skip the delay for virtual pcie ports, which
> > reduces the total time for the unplug operation from 6-7 seconds
> > to 1-2 seconds.
> 
> BTW how come it's still taking seconds, not milliseconds?

I've tackled the 5 seconds only, biggest chunk and easy target because
the only reason to have that is to allow operators press the attention
button again to cancel, so the risk to break something here is rather
low.

There are some more wait times elsewhere, to give the hardware the
time needed when powering up/down slots, which sum up to roughly one
second, and the time the driver needs to shutdown the device goes on
top of that (typically not much).

take care,
  Gerd
Michael S. Tsirkin Nov. 12, 2021, 10:09 a.m. UTC | #5
On Fri, Nov 12, 2021 at 10:56:29AM +0100, Gerd Hoffmann wrote:
>   Hi,
> 
> > > This patch adds the fast_virtual_unplug module parameter to the
> > > pciehp driver.  When enabled (which is the default) the linux
> > > kernel will simply skip the delay for virtual pcie ports, which
> > > reduces the total time for the unplug operation from 6-7 seconds
> > > to 1-2 seconds.
> > 
> > BTW how come it's still taking seconds, not milliseconds?
> 
> I've tackled the 5 seconds only, biggest chunk and easy target because
> the only reason to have that is to allow operators press the attention
> button again to cancel, so the risk to break something here is rather
> low.
> 
> There are some more wait times elsewhere, to give the hardware the
> time needed when powering up/down slots, which sum up to roughly one
> second, and the time the driver needs to shutdown the device goes on
> top of that (typically not much).
> 
> take care,
>   Gerd

Probably also unnecessary for a virtual bridge. We'll need to fix these
up if we want native hotplug to be useful for workloads like kata -
these expect hotplug time in the milliseconds.
Gerd Hoffmann Nov. 12, 2021, 11:28 a.m. UTC | #6
Hi,

> "Virtual" doesn't seem like quite the right descriptor here.  That's
> one use case, but I think the parameter should describe the actual
> *effect*, not the use case, e.g., something related to the delay after
> the attention button.

Well, it's enabled only for virtual pcie ports because I could hardly
enable it by default otherwise.

There is another delay in remove_board(), mst suggests removing that
too, and that surely would be something we can't do on physical
hardware ...

So I'd prefer to keep the name.

> If it's practical, I think it would be nicer to have a sysfs attribute
> instead of a kernel boot parameter.  Then we wouldn't have to reboot
> to change this, and it could be generalized to allow arbitrary delays,
> i.e., no delay, 5 seconds, or whatever the admin decides.

It's a module parameter so will show up in
/sys/module/pciehp/parameters/ and it is writable by root.

[ will address the other things mentioned in v2 ].

take care,
  Gerd
Lukas Wunner Nov. 14, 2021, 4:39 p.m. UTC | #7
On Thu, Nov 11, 2021 at 10:02:24AM +0100, Gerd Hoffmann wrote:
> The PCIe specification asks the OS to wait five seconds after the

The spec reference Bjorn asked for is: PCIe r5.0, sec. 6.7.1.5

> attention button has been pressed before actually un-plugging the
> device.  This gives the operator the chance to cancel the operation
> by pressing the attention button again within those five seconds.
> 
> For physical hardware this makes sense.  Picking the wrong button
> by accident can easily happen and it can be corrected that way.
> 
> For virtual hardware the benefits are questionable.  Typically
> users find the five second delay annoying.

Why does virtual hardware implement the Attention Button if it's
perceived as annoying?  Just amend qemu so that it doesn't advertise
presence of an Attention Button to get rid of the delay.  (Clear the
Attention Button Present bit in the Slot Capabilities register.)

An Attention Button doesn't make any sense for virtual hardware
except to test or debug support for it in the kernel.  Just make
presence of the Attention Button optional and be done with it.

You'll still be able to bring down the slot in software via the
"remove" attribute in sysfs.

Same for the 1 second delay in remove_board().  That's mandated by
PCIe r5.0, sec. 6.7.1.8, but it's only observed if a Power Controller
is present.  So just clear the Power Controller Present bit in the
Slot Capabilities register and the delay is gone.


> @@ -109,6 +110,8 @@ struct controller {
>  	unsigned int ist_running;
>  	int request_result;
>  	wait_queue_head_t requester;
> +
> +	bool is_virtual;
>  };

This is a quirk for a specific device, so please move it further up to the
/* capabilities and quirks */ section of struct controller.


> @@ -227,6 +227,11 @@ static int pciehp_probe(struct pcie_device *dev)
>  		goto err_out_shutdown_notification;
>  	}
>  
> +	if (dev->port->vendor == PCI_VENDOR_ID_REDHAT &&
> +	    dev->port->device == 0x000c)
> +		/* qemu pcie root port */
> +		ctrl->is_virtual = true;
> +

Move this to pcie_init() in pciehp_hpc.c below the existing quirks for
hotplug_user_indicators and is_thunderbolt.


> +static bool fast_virtual_unplug = true;
> +module_param(fast_virtual_unplug, bool, 0644);

An integer parameter to configure a custom delay would be nicer IMO.
Of course, anything else than 5 sec deviates from the spec.

Thanks,

Lukas
Michael S. Tsirkin Nov. 14, 2021, 5:24 p.m. UTC | #8
On Sun, Nov 14, 2021 at 05:39:58PM +0100, Lukas Wunner wrote:
> On Thu, Nov 11, 2021 at 10:02:24AM +0100, Gerd Hoffmann wrote:
> > The PCIe specification asks the OS to wait five seconds after the
> 
> The spec reference Bjorn asked for is: PCIe r5.0, sec. 6.7.1.5
> 
> > attention button has been pressed before actually un-plugging the
> > device.  This gives the operator the chance to cancel the operation
> > by pressing the attention button again within those five seconds.
> > 
> > For physical hardware this makes sense.  Picking the wrong button
> > by accident can easily happen and it can be corrected that way.
> > 
> > For virtual hardware the benefits are questionable.  Typically
> > users find the five second delay annoying.
> 
> Why does virtual hardware implement the Attention Button if it's
> perceived as annoying?  Just amend qemu so that it doesn't advertise
> presence of an Attention Button to get rid of the delay.  (Clear the
> Attention Button Present bit in the Slot Capabilities register.)

Because we want ability to request device removal from outside the
guest.

> An Attention Button doesn't make any sense for virtual hardware
> except to test or debug support for it in the kernel.  Just make
> presence of the Attention Button optional and be done with it.
> 
> You'll still be able to bring down the slot in software via the
> "remove" attribute in sysfs.

This requires guest specific code though. Emulating the attention button
works in a guest independent way.

> Same for the 1 second delay in remove_board().  That's mandated by
> PCIe r5.0, sec. 6.7.1.8, but it's only observed if a Power Controller
> is present.  So just clear the Power Controller Present bit in the
> Slot Capabilities register and the delay is gone.
> 
> 
> > @@ -109,6 +110,8 @@ struct controller {
> >  	unsigned int ist_running;
> >  	int request_result;
> >  	wait_queue_head_t requester;
> > +
> > +	bool is_virtual;
> >  };
> 
> This is a quirk for a specific device, so please move it further up to the
> /* capabilities and quirks */ section of struct controller.
> 
> 
> > @@ -227,6 +227,11 @@ static int pciehp_probe(struct pcie_device *dev)
> >  		goto err_out_shutdown_notification;
> >  	}
> >  
> > +	if (dev->port->vendor == PCI_VENDOR_ID_REDHAT &&
> > +	    dev->port->device == 0x000c)
> > +		/* qemu pcie root port */
> > +		ctrl->is_virtual = true;
> > +
> 
> Move this to pcie_init() in pciehp_hpc.c below the existing quirks for
> hotplug_user_indicators and is_thunderbolt.
> 
> 
> > +static bool fast_virtual_unplug = true;
> > +module_param(fast_virtual_unplug, bool, 0644);
> 
> An integer parameter to configure a custom delay would be nicer IMO.
> Of course, anything else than 5 sec deviates from the spec.
> 
> Thanks,
> 
> Lukas
Lukas Wunner Nov. 14, 2021, 6:06 p.m. UTC | #9
On Sun, Nov 14, 2021 at 12:24:43PM -0500, Michael S. Tsirkin wrote:
> On Sun, Nov 14, 2021 at 05:39:58PM +0100, Lukas Wunner wrote:
> > Why does virtual hardware implement the Attention Button if it's
> > perceived as annoying?  Just amend qemu so that it doesn't advertise
> > presence of an Attention Button to get rid of the delay.  (Clear the
> > Attention Button Present bit in the Slot Capabilities register.)
> 
> Because we want ability to request device removal from outside the
> guest.

Please elaborate.  Does "outside the guest" mean on the host?
How do you represent the Attention Button outside the guest
and route events through to the guest?


> > An Attention Button doesn't make any sense for virtual hardware
> > except to test or debug support for it in the kernel.  Just make
> > presence of the Attention Button optional and be done with it.
> > 
> > You'll still be able to bring down the slot in software via the
> > "remove" attribute in sysfs.
> 
> This requires guest specific code though. Emulating the attention button
> works in a guest independent way.

It sounds like you're using the Attention Button because it does
almost, but not quite what you want for your specific use case.
Now you're trying to change its behavior in a way that deviates
from the spec to align it with your use case.

Why don't you just trigger surprise-removal from outside the guest?

Thanks,

Lukas
Michael S. Tsirkin Nov. 14, 2021, 10:37 p.m. UTC | #10
On Sun, Nov 14, 2021 at 07:06:04PM +0100, Lukas Wunner wrote:
> On Sun, Nov 14, 2021 at 12:24:43PM -0500, Michael S. Tsirkin wrote:
> > On Sun, Nov 14, 2021 at 05:39:58PM +0100, Lukas Wunner wrote:
> > > Why does virtual hardware implement the Attention Button if it's
> > > perceived as annoying?  Just amend qemu so that it doesn't advertise
> > > presence of an Attention Button to get rid of the delay.  (Clear the
> > > Attention Button Present bit in the Slot Capabilities register.)
> > 
> > Because we want ability to request device removal from outside the
> > guest.
> 
> Please elaborate.  Does "outside the guest" mean on the host?
> How do you represent the Attention Button outside the guest
> and route events through to the guest?

The usual way, using kvm ioctls.

> 
> > > An Attention Button doesn't make any sense for virtual hardware
> > > except to test or debug support for it in the kernel.  Just make
> > > presence of the Attention Button optional and be done with it.
> > > 
> > > You'll still be able to bring down the slot in software via the
> > > "remove" attribute in sysfs.
> > 
> > This requires guest specific code though. Emulating the attention button
> > works in a guest independent way.
> 
> It sounds like you're using the Attention Button because it does
> almost, but not quite what you want for your specific use case.
> Now you're trying to change its behavior in a way that deviates
> from the spec to align it with your use case.
> 
> Why don't you just trigger surprise-removal from outside the guest?
> 
> Thanks,
> 
> Lukas

Because linux does not handle it well for all devices.  Fixing that
requires fixing all drivers.
Gerd Hoffmann Nov. 15, 2021, 9:59 a.m. UTC | #11
Hi,

> > This requires guest specific code though. Emulating the attention button
> > works in a guest independent way.
> 
> It sounds like you're using the Attention Button because it does
> almost, but not quite what you want for your specific use case.

Well, we want send a request to the guest to shutdown and poweroff the
device.

> Why don't you just trigger surprise-removal from outside the guest?

To give the guest a chance to shutdown the device gracefully?
Umount filesystems, flush data to disk.

Also guest stability.  Traditionally linux isn't very good at dealing
with surprise removal, although the situation is improving.

Emulating surprise-removal has been discussed too.  More as fallback
in case the guest doesn't respond to the attention button.  Right now
we don't have it but should be doable.  For some kinds of devices it
shouldn't be a problem to go straight to surprise-removal.

I'll go experiment with that.

take care,
  Gerd
Gerd Hoffmann Nov. 15, 2021, 10:09 a.m. UTC | #12
Hi,

> Same for the 1 second delay in remove_board().  That's mandated by
> PCIe r5.0, sec. 6.7.1.8, but it's only observed if a Power Controller
> is present.  So just clear the Power Controller Present bit in the
> Slot Capabilities register and the delay is gone.

Well, the power control bit is a useful data channel.  qemu can use that
to figure whenever the guest uses the device (power is on) or not (power
is off).  And in case power is off anyway we can simply remove the
device without the attention button dance.

take care,
  Gerd
diff mbox series

Patch

diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 69fd401691be..131ffec2e947 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -79,6 +79,7 @@  extern int pciehp_poll_time;
  * @request_result: result of last user request submitted to the IRQ thread
  * @requester: wait queue to wake up on completion of user request,
  *	used for synchronous slot enable/disable request via sysfs
+ * @is_virtual: virtual machine pcie port.
  *
  * PCIe hotplug has a 1:1 relationship between controller and slot, hence
  * unlike other drivers, the two aren't represented by separate structures.
@@ -109,6 +110,8 @@  struct controller {
 	unsigned int ist_running;
 	int request_result;
 	wait_queue_head_t requester;
+
+	bool is_virtual;
 };
 
 /**
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index ad3393930ecb..28867ec33f5b 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -227,6 +227,11 @@  static int pciehp_probe(struct pcie_device *dev)
 		goto err_out_shutdown_notification;
 	}
 
+	if (dev->port->vendor == PCI_VENDOR_ID_REDHAT &&
+	    dev->port->device == 0x000c)
+		/* qemu pcie root port */
+		ctrl->is_virtual = true;
+
 	pciehp_check_presence(ctrl);
 
 	return 0;
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index 529c34808440..119bb11f87d6 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -15,12 +15,17 @@ 
 
 #define dev_fmt(fmt) "pciehp: " fmt
 
+#include <linux/moduleparam.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/pm_runtime.h>
 #include <linux/pci.h>
 #include "pciehp.h"
 
+static bool fast_virtual_unplug = true;
+module_param(fast_virtual_unplug, bool, 0644);
+MODULE_PARM_DESC(fast_virtual_unplug, "Fast unplug (don't wait 5 seconds) for virtual machines.");
+
 /* The following routines constitute the bulk of the
    hotplug controller logic
  */
@@ -176,7 +181,11 @@  void pciehp_handle_button_press(struct controller *ctrl)
 		/* blink power indicator and turn off attention */
 		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
 				      PCI_EXP_SLTCTL_ATTN_IND_OFF);
-		schedule_delayed_work(&ctrl->button_work, 5 * HZ);
+		if (ctrl->is_virtual && fast_virtual_unplug) {
+			schedule_delayed_work(&ctrl->button_work, 1);
+		} else {
+			schedule_delayed_work(&ctrl->button_work, 5 * HZ);
+		}
 		break;
 	case BLINKINGOFF_STATE:
 	case BLINKINGON_STATE: