Message ID | 1565797908-5970-1-git-send-email-haiyangz@microsoft.com
State      | Superseded, archived
Series     | [v5,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision
On Wed, Aug 14, 2019 at 03:52:15PM +0000, Haiyang Zhang wrote:
> Currently in Azure cloud, for passthrough devices, the host sets the device
> instance ID's bytes 8 - 15 to a value derived from the host HWID, which is
> the same on all devices in a VM. So, the device instance ID's bytes 8 and 9
> provided by the host are no longer unique. This affects all Azure hosts
> since last year, and can cause device passthrough to VMs to fail because

Bjorn already asked, can you be a bit more specific than "since last
year" here please ?

It would be useful to understand when/how this became an issue.

> the bytes 8 and 9 are used as PCI domain number. Collision of domain
> numbers will cause the second device with the same domain number fail to
> load.
>
> In the cases of collision, we will detect and find another number that is
> not in use.
>
> Suggested-by: Michael Kelley <mikelley@microsoft.com>
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> Acked-by: Sasha Levin <sashal@kernel.org>
> ---
>  drivers/pci/controller/pci-hyperv.c | 92 +++++++++++++++++++++++++++++++------
>  1 file changed, 79 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 40b6254..31b8fd5 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
>          complete(&hbus->remove_event);
>  }
>
> +#define HVPCI_DOM_MAP_SIZE  (64 * 1024)
> +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> +
> +/*
> + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
> + * as invalid for passthrough PCI devices of this driver.
> + */
> +#define HVPCI_DOM_INVALID   0
> +
> +/**
> + * hv_get_dom_num() - Get a valid PCI domain number
> + * Check if the PCI domain number is in use, and return another number if
> + * it is in use.
> + *
> + * @dom: Requested domain number
> + *
> + * return: domain number on success, HVPCI_DOM_INVALID on failure
> + */
> +static u16 hv_get_dom_num(u16 dom)
> +{
> +        unsigned int i;
> +
> +        if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> +                return dom;
> +
> +        for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> +                if (test_and_set_bit(i, hvpci_dom_map) == 0)
> +                        return i;
> +        }

Don't you need locking around code reading/updating hvpci_dom_map ?

Thanks,
Lorenzo

> +
> +        return HVPCI_DOM_INVALID;
> +}
> +
> +/**
> + * hv_put_dom_num() - Mark the PCI domain number as free
> + * @dom: Domain number to be freed
> + */
> +static void hv_put_dom_num(u16 dom)
> +{
> +        clear_bit(dom, hvpci_dom_map);
> +}
> +
>  /**
>   * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
>   * @hdev: VMBus's tracking struct for this root PCI bus
> @@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>                          const struct hv_vmbus_device_id *dev_id)
>  {
>          struct hv_pcibus_device *hbus;
> +        u16 dom_req, dom;
>          int ret;
>
>          /*
> @@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev,
>          hbus->state = hv_pcibus_init;
>
>          /*
> -         * The PCI bus "domain" is what is called "segment" in ACPI and
> -         * other specs. Pull it from the instance ID, to get something
> -         * unique. Bytes 8 and 9 are what is used in Windows guests, so
> -         * do the same thing for consistency. Note that, since this code
> -         * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
> -         * that (1) the only domain in use for something that looks like
> -         * a physical PCI bus (which is actually emulated by the
> -         * hypervisor) is domain 0 and (2) there will be no overlap
> -         * between domains derived from these instance IDs in the same
> -         * VM.
> +         * The PCI bus "domain" is what is called "segment" in ACPI and other
> +         * specs. Pull it from the instance ID, to get something usually
> +         * unique. In rare cases of collision, we will find out another number
> +         * not in use.
> +         *
> +         * Note that, since this code only runs in a Hyper-V VM, Hyper-V
> +         * together with this guest driver can guarantee that (1) The only
> +         * domain used by Gen1 VMs for something that looks like a physical
> +         * PCI bus (which is actually emulated by the hypervisor) is domain 0.
> +         * (2) There will be no overlap between domains (after fixing possible
> +         * collisions) in the same VM.
>           */
> -        hbus->sysdata.domain = hdev->dev_instance.b[9] |
> -                               hdev->dev_instance.b[8] << 8;
> +        dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
> +        dom = hv_get_dom_num(dom_req);
> +
> +        if (dom == HVPCI_DOM_INVALID) {
> +                dev_err(&hdev->device,
> +                        "Unable to use dom# 0x%hx or other numbers", dom_req);
> +                ret = -EINVAL;
> +                goto free_bus;
> +        }
> +
> +        if (dom != dom_req)
> +                dev_info(&hdev->device,
> +                         "PCI dom# 0x%hx has collision, using 0x%hx",
> +                         dom_req, dom);
> +
> +        hbus->sysdata.domain = dom;
>
>          hbus->hdev = hdev;
>          refcount_set(&hbus->remove_lock, 1);
> @@ -2562,7 +2620,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>                             hbus->sysdata.domain);
>          if (!hbus->wq) {
>                  ret = -ENOMEM;
> -                goto free_bus;
> +                goto free_dom;
>          }
>
>          ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
> @@ -2639,6 +2697,8 @@ static int hv_pci_probe(struct hv_device *hdev,
>          vmbus_close(hdev->channel);
>  destroy_wq:
>          destroy_workqueue(hbus->wq);
> +free_dom:
> +        hv_put_dom_num(hbus->sysdata.domain);
>  free_bus:
>          free_page((unsigned long)hbus);
>          return ret;
> @@ -2720,6 +2780,9 @@ static int hv_pci_remove(struct hv_device *hdev)
>          put_hvpcibus(hbus);
>          wait_for_completion(&hbus->remove_event);
>          destroy_workqueue(hbus->wq);
> +
> +        hv_put_dom_num(hbus->sysdata.domain);
> +
>          free_page((unsigned long)hbus);
>          return 0;
>  }
> @@ -2747,6 +2810,9 @@ static void __exit exit_hv_pci_drv(void)
>
>  static int __init init_hv_pci_drv(void)
>  {
> +        /* Set the invalid domain number's bit, so it will not be used */
> +        set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
> +
>          return vmbus_driver_register(&hv_pci_drv);
>  }
>
> --
> 1.8.3.1
>
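To make the collision concrete, here is a minimal user-space sketch (not the driver code; the function name dom_from_instance_id and the instance-ID byte values are invented for illustration) of how the 16-bit domain candidate is formed from bytes 8 and 9 of the VMBus device instance ID, and why two devices whose hosts report identical bytes 8 - 15 end up requesting the same domain:

/* Minimal user-space sketch; instance-ID contents are hypothetical. */
#include <stdint.h>
#include <stdio.h>

/* Same byte order the driver uses: byte 8 is the high byte. */
static uint16_t dom_from_instance_id(const uint8_t b[16])
{
        return (uint16_t)(b[8] << 8 | b[9]);
}

int main(void)
{
        /* Two made-up instance IDs whose bytes 8 - 15 happen to match. */
        uint8_t dev1[16] = { [8] = 0x12, [9] = 0x34 };
        uint8_t dev2[16] = { [8] = 0x12, [9] = 0x34 };

        uint16_t d1 = dom_from_instance_id(dev1);
        uint16_t d2 = dom_from_instance_id(dev2);

        printf("dev1 dom# 0x%04x, dev2 dom# 0x%04x -> %s\n",
               d1, d2, d1 == d2 ? "collision" : "distinct");
        return 0;
}

Before the patch the colliding value was used directly as hbus->sysdata.domain, so the second device with the same domain number failed to load; with the patch the driver falls back to the bitmap allocator quoted above.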
> -----Original Message-----
> From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> Sent: Thursday, August 15, 2019 12:11 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: sashal@kernel.org; bhelgaas@google.com; linux-hyperv@vger.kernel.org;
> linux-pci@vger.kernel.org; KY Srinivasan <kys@microsoft.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; olaf@aepfle.de;
> vkuznets <vkuznets@redhat.com>; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v5,1/2] PCI: hv: Detect and fix Hyper-V PCI domain
> number collision
>
> On Wed, Aug 14, 2019 at 03:52:15PM +0000, Haiyang Zhang wrote:
> > Currently in Azure cloud, for passthrough devices, the host sets the device
> > instance ID's bytes 8 - 15 to a value derived from the host HWID, which is
> > the same on all devices in a VM. So, the device instance ID's bytes 8 and 9
> > provided by the host are no longer unique. This affects all Azure hosts
> > since last year, and can cause device passthrough to VMs to fail because
>
> Bjorn already asked, can you be a bit more specific than "since last
> year" here please ?
>
> It would be useful to understand when/how this became an issue.

The host change happened around July 2018. The Azure rollout takes
multiple weeks, so there is no specific date. I will include the month and
year in the log.

> > the bytes 8 and 9 are used as PCI domain number. Collision of domain
> > numbers will cause the second device with the same domain number fail to
> > load.
> >
> > In the cases of collision, we will detect and find another number that is
> > not in use.
> >
> > Suggested-by: Michael Kelley <mikelley@microsoft.com>
> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > Acked-by: Sasha Levin <sashal@kernel.org>
> > ---
> >  drivers/pci/controller/pci-hyperv.c | 92 +++++++++++++++++++++++++++++++------
> >  1 file changed, 79 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> > index 40b6254..31b8fd5 100644
> > --- a/drivers/pci/controller/pci-hyperv.c
> > +++ b/drivers/pci/controller/pci-hyperv.c
> > @@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
> >          complete(&hbus->remove_event);
> >  }
> >
> > +#define HVPCI_DOM_MAP_SIZE  (64 * 1024)
> > +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> > +
> > +/*
> > + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
> > + * as invalid for passthrough PCI devices of this driver.
> > + */
> > +#define HVPCI_DOM_INVALID   0
> > +
> > +/**
> > + * hv_get_dom_num() - Get a valid PCI domain number
> > + * Check if the PCI domain number is in use, and return another number if
> > + * it is in use.
> > + *
> > + * @dom: Requested domain number
> > + *
> > + * return: domain number on success, HVPCI_DOM_INVALID on failure
> > + */
> > +static u16 hv_get_dom_num(u16 dom)
> > +{
> > +        unsigned int i;
> > +
> > +        if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> > +                return dom;
> > +
> > +        for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> > +                if (test_and_set_bit(i, hvpci_dom_map) == 0)
> > +                        return i;
> > +        }
>
> Don't you need locking around code reading/updating hvpci_dom_map ?

If the bit changes after for_each_clear_bit() has seen it as a "clear bit",
test_and_set_bit(), which does the test and set in one atomic operation,
returns 1 instead of 0. The loop then continues to the next clear bit,
until a test_and_set_bit() succeeds. So no locking is necessary here.

Thanks,
- Haiyang
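Haiyang's argument can be seen in a small user-space analogue (a sketch only; the kernel driver uses test_and_set_bit()/clear_bit() on a 64K-entry bitmap, while the names and the tiny map size below are illustrative). Each slot is an atomic flag, and whichever caller's atomic exchange observes 0 first owns that number, so the scan needs no surrounding lock:

/*
 * User-space analogue of the lock-free allocation argument; the map size
 * and identifiers are hypothetical, not the driver's.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DOM_MAP_SIZE 64
#define DOM_INVALID  0

static atomic_bool dom_map[DOM_MAP_SIZE];

static uint16_t get_dom_num(uint16_t dom)
{
        /* Fast path: the requested number is still free. */
        if (!atomic_exchange(&dom_map[dom], true))
                return dom;

        /* Slow path: scan for any free number; a lost race just moves on. */
        for (unsigned int i = 0; i < DOM_MAP_SIZE; i++)
                if (!atomic_exchange(&dom_map[i], true))
                        return (uint16_t)i;

        return DOM_INVALID;
}

static void put_dom_num(uint16_t dom)
{
        atomic_store(&dom_map[dom], false);
}

int main(void)
{
        atomic_store(&dom_map[DOM_INVALID], true);      /* reserve 0 */

        uint16_t a = get_dom_num(5);    /* gets 5 */
        uint16_t b = get_dom_num(5);    /* collision: gets the next free slot */

        printf("first: %u, second: %u\n", (unsigned)a, (unsigned)b);
        put_dom_num(a);
        put_dom_num(b);
        return 0;
}

The worst case in a race is that a caller loses a slot it was about to take and simply claims the next free one; correctness never depends on the bitmap staying unchanged between the scan and the set.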