diff mbox series

[v5,01/17] hw/vfio/pci: Ensure MSI and MSI-X do not overlap

Message ID 20221028122629.3269-2-akihiko.odaki@daynix.com (mailing list archive)
State New, archived
Headers show
Series pci: Abort if pci_add_capability fails | expand

Commit Message

Akihiko Odaki Oct. 28, 2022, 12:26 p.m. UTC
vfio_add_std_cap() is designed to ensure that capabilities do not
overlap, but it failed to do so for MSI and MSI-X capabilities.

Ensure MSI and MSI-X capabilities do not overlap with others by omitting
other overlapping capabilities.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
---
 hw/vfio/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------
 hw/vfio/pci.h |  3 +++
 2 files changed, 56 insertions(+), 10 deletions(-)

Comments

Alex Williamson Oct. 28, 2022, 2:16 p.m. UTC | #1
On Fri, 28 Oct 2022 21:26:13 +0900
Akihiko Odaki <akihiko.odaki@daynix.com> wrote:

> vfio_add_std_cap() is designed to ensure that capabilities do not
> overlap, but it failed to do so for MSI and MSI-X capabilities.
> 
> Ensure MSI and MSI-X capabilities do not overlap with others by omitting
> other overlapping capabilities.
> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
>  hw/vfio/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------
>  hw/vfio/pci.h |  3 +++
>  2 files changed, 56 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 939dcc3d4a..36c8f3dc85 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1278,23 +1278,42 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
>      }
>  }
>  
> -static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> +static void vfio_msi_early_setup(VFIOPCIDevice *vdev, Error **errp)
>  {
>      uint16_t ctrl;
> -    bool msi_64bit, msi_maskbit;
> -    int ret, entries;
> -    Error *err = NULL;
> +    uint8_t pos;
> +
> +    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSI);
> +    if (!pos) {
> +        return;
> +    }
>  
>      if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
>                vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
>          error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
> -        return -errno;
> +        return;
>      }
> -    ctrl = le16_to_cpu(ctrl);
> +    vdev->msi_pos = pos;
> +    vdev->msi_ctrl = le16_to_cpu(ctrl);
>  
> -    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
> -    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
> -    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> +    vdev->msi_cap_size = 0xa;
> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT)) {
> +        vdev->msi_cap_size += 0xa;
> +    }
> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT)) {
> +        vdev->msi_cap_size += 0x4;
> +    }
> +}
> +
> +static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> +{
> +    bool msi_64bit, msi_maskbit;
> +    int ret, entries;
> +    Error *err = NULL;
> +
> +    msi_64bit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT);
> +    msi_maskbit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT);
> +    entries = 1 << ((vdev->msi_ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
>  
>      trace_vfio_msi_setup(vdev->vbasedev.name, pos);
>  
> @@ -1306,7 +1325,6 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>          error_propagate_prepend(errp, err, "msi_init failed: ");
>          return ret;
>      }
> -    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
>  
>      return 0;
>  }
> @@ -1524,6 +1542,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>      pba = le32_to_cpu(pba);
>  
>      msix = g_malloc0(sizeof(*msix));
> +    msix->pos = pos;
>      msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
>      msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
>      msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
> @@ -2025,6 +2044,24 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
>          }
>      }
>  
> +    if (cap_id != PCI_CAP_ID_MSI &&
> +        range_covers_byte(vdev->msi_pos, vdev->msi_cap_size, pos)) {
> +        warn_report(VFIO_MSG_PREFIX
> +                    "A capability overlaps with MSI, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
> +                    vdev->vbasedev.name, cap_id, pos,
> +                    vdev->msi_pos, vdev->msi_pos + vdev->msi_cap_size);
> +        return 0;
> +    }
> +
> +    if (cap_id != PCI_CAP_ID_MSIX && vdev->msix &&
> +        range_covers_byte(vdev->msix->pos, MSIX_CAP_LENGTH, pos)) {
> +        warn_report(VFIO_MSG_PREFIX
> +                    "A capability overlaps with MSI-X, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
> +                    vdev->vbasedev.name, cap_id, pos,
> +                    vdev->msix->pos, vdev->msix->pos + MSIX_CAP_LENGTH);
> +        return 0;
> +    }

Capabilities are not a single byte, the fact that it doesn't start
within the MSI or MSI-X capability is not a sufficient test.  We're
also choosing to prioritize MSI and MSI-X capabilities by protecting
that range rather than the existing behavior where we'd drop those
capabilities if they overlap with another capability that has already
been placed.  There are merits to both approaches, but I don't see any
justification here to change the current behavior.

Isn't the most similar behavior to existing to pass the available size
to vfio_msi[x]_setup() and return an errno if the size would be
exceeded?  Something like below (untested, and requires exporting
msi_cap_sizeof()).  Thanks,

Alex

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939dcc3d4a9e..485f9bc5102d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1278,11 +1278,13 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
     }
 }
 
-static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
+static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos,
+                          uint8_t size, Error **errp)
 {
     uint16_t ctrl;
     bool msi_64bit, msi_maskbit;
     int ret, entries;
+    uint8_t msi_cap_size;
     Error *err = NULL;
 
     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
@@ -1295,6 +1297,10 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+    msi_cap_size = msi_cap_sizeof(ctrl);
+
+    if (msi_cap_size > size)
+	    return -ENOSPC;
 
     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
 
@@ -1306,7 +1312,7 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
         error_propagate_prepend(errp, err, "msi_init failed: ");
         return ret;
     }
-    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
+    vdev->msi_cap_size = msi_cap_size;
 
     return 0;
 }
@@ -1570,11 +1576,15 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
     vfio_pci_relocate_msix(vdev, errp);
 }
 
-static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
+static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos,
+                           uint8_t size, Error **errp)
 {
     int ret;
     Error *err = NULL;
 
+    if (MSIX_CAP_LENGTH > size)
+	    return -ENOSPC;
+
     vdev->msix->pending = g_new0(unsigned long,
                                  BITS_TO_LONGS(vdev->msix->entries));
     ret = msix_init(&vdev->pdev, vdev->msix->entries,
@@ -2033,14 +2043,14 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
 
     switch (cap_id) {
     case PCI_CAP_ID_MSI:
-        ret = vfio_msi_setup(vdev, pos, errp);
+        ret = vfio_msi_setup(vdev, pos, size, errp);
         break;
     case PCI_CAP_ID_EXP:
         vfio_check_pcie_flr(vdev, pos);
         ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
         break;
     case PCI_CAP_ID_MSIX:
-        ret = vfio_msix_setup(vdev, pos, errp);
+        ret = vfio_msix_setup(vdev, pos, size, errp);
         break;
     case PCI_CAP_ID_PM:
         vfio_check_pm_reset(vdev, pos);
Akihiko Odaki Oct. 28, 2022, 4:12 p.m. UTC | #2
On 2022/10/28 23:16, Alex Williamson wrote:
> On Fri, 28 Oct 2022 21:26:13 +0900
> Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> 
>> vfio_add_std_cap() is designed to ensure that capabilities do not
>> overlap, but it failed to do so for MSI and MSI-X capabilities.
>>
>> Ensure MSI and MSI-X capabilities do not overlap with others by omitting
>> other overlapping capabilities.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>> ---
>>   hw/vfio/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------
>>   hw/vfio/pci.h |  3 +++
>>   2 files changed, 56 insertions(+), 10 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 939dcc3d4a..36c8f3dc85 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -1278,23 +1278,42 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
>>       }
>>   }
>>   
>> -static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>> +static void vfio_msi_early_setup(VFIOPCIDevice *vdev, Error **errp)
>>   {
>>       uint16_t ctrl;
>> -    bool msi_64bit, msi_maskbit;
>> -    int ret, entries;
>> -    Error *err = NULL;
>> +    uint8_t pos;
>> +
>> +    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSI);
>> +    if (!pos) {
>> +        return;
>> +    }
>>   
>>       if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
>>                 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
>>           error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
>> -        return -errno;
>> +        return;
>>       }
>> -    ctrl = le16_to_cpu(ctrl);
>> +    vdev->msi_pos = pos;
>> +    vdev->msi_ctrl = le16_to_cpu(ctrl);
>>   
>> -    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
>> -    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
>> -    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
>> +    vdev->msi_cap_size = 0xa;
>> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT)) {
>> +        vdev->msi_cap_size += 0xa;
>> +    }
>> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT)) {
>> +        vdev->msi_cap_size += 0x4;
>> +    }
>> +}
>> +
>> +static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>> +{
>> +    bool msi_64bit, msi_maskbit;
>> +    int ret, entries;
>> +    Error *err = NULL;
>> +
>> +    msi_64bit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT);
>> +    msi_maskbit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT);
>> +    entries = 1 << ((vdev->msi_ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
>>   
>>       trace_vfio_msi_setup(vdev->vbasedev.name, pos);
>>   
>> @@ -1306,7 +1325,6 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>>           error_propagate_prepend(errp, err, "msi_init failed: ");
>>           return ret;
>>       }
>> -    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
>>   
>>       return 0;
>>   }
>> @@ -1524,6 +1542,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>>       pba = le32_to_cpu(pba);
>>   
>>       msix = g_malloc0(sizeof(*msix));
>> +    msix->pos = pos;
>>       msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
>>       msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
>>       msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
>> @@ -2025,6 +2044,24 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
>>           }
>>       }
>>   
>> +    if (cap_id != PCI_CAP_ID_MSI &&
>> +        range_covers_byte(vdev->msi_pos, vdev->msi_cap_size, pos)) {
>> +        warn_report(VFIO_MSG_PREFIX
>> +                    "A capability overlaps with MSI, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
>> +                    vdev->vbasedev.name, cap_id, pos,
>> +                    vdev->msi_pos, vdev->msi_pos + vdev->msi_cap_size);
>> +        return 0;
>> +    }
>> +
>> +    if (cap_id != PCI_CAP_ID_MSIX && vdev->msix &&
>> +        range_covers_byte(vdev->msix->pos, MSIX_CAP_LENGTH, pos)) {
>> +        warn_report(VFIO_MSG_PREFIX
>> +                    "A capability overlaps with MSI-X, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
>> +                    vdev->vbasedev.name, cap_id, pos,
>> +                    vdev->msix->pos, vdev->msix->pos + MSIX_CAP_LENGTH);
>> +        return 0;
>> +    }
> 
> Capabilities are not a single byte, the fact that it doesn't start
> within the MSI or MSI-X capability is not a sufficient test.  We're
> also choosing to prioritize MSI and MSI-X capabilities by protecting
> that range rather than the existing behavior where we'd drop those
> capabilities if they overlap with another capability that has already
> been placed.  There are merits to both approaches, but I don't see any
> justification here to change the current behavior.
> 
> Isn't the most similar behavior to existing to pass the available size
> to vfio_msi[x]_setup() and return an errno if the size would be
> exceeded?  Something like below (untested, and requires exporting
> msi_cap_sizeof()).  Thanks,

It only tests the beginning of the capability currently being added 
because its end is determined by vfio_std_cap_max_size() so that the 
overlap does not happen.

A comment in vfio_add_std_cap() says:
 >     /*
 >      * If it becomes important to configure capabilities to their actual
 >      * size, use this as the default when it's something we don't 
recognize.
 >      * Since QEMU doesn't actually handle many of the config accesses,
 >      * exact size doesn't seem worthwhile.
 >      */

My understanding of the problem is that while clipping is performed when 
overlapping two capabilities other than MSI and MSI-X according to the 
comment, the clipping does not happen when one of the overlapping 
capability is MSI or MSI-X.

According to that, the correct way to fix is to perform clipping also in 
such a case. As QEMU actually handles the config acccesses for MSI and 
MSI-X, MSI and MSI-X are always priotized over the other capabilities.

Regards,
Akihiko Odaki

> 
> Alex
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 939dcc3d4a9e..485f9bc5102d 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1278,11 +1278,13 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
>       }
>   }
>   
> -static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> +static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos,
> +                          uint8_t size, Error **errp)
>   {
>       uint16_t ctrl;
>       bool msi_64bit, msi_maskbit;
>       int ret, entries;
> +    uint8_t msi_cap_size;
>       Error *err = NULL;
>   
>       if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
> @@ -1295,6 +1297,10 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>       msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
>       msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
>       entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> +    msi_cap_size = msi_cap_sizeof(ctrl);
> +
> +    if (msi_cap_size > size)
> +	    return -ENOSPC;
>   
>       trace_vfio_msi_setup(vdev->vbasedev.name, pos);
>   
> @@ -1306,7 +1312,7 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>           error_propagate_prepend(errp, err, "msi_init failed: ");
>           return ret;
>       }
> -    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
> +    vdev->msi_cap_size = msi_cap_size;
>   
>       return 0;
>   }
> @@ -1570,11 +1576,15 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>       vfio_pci_relocate_msix(vdev, errp);
>   }
>   
> -static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> +static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos,
> +                           uint8_t size, Error **errp)
>   {
>       int ret;
>       Error *err = NULL;
>   
> +    if (MSIX_CAP_LENGTH > size)
> +	    return -ENOSPC;
> +
>       vdev->msix->pending = g_new0(unsigned long,
>                                    BITS_TO_LONGS(vdev->msix->entries));
>       ret = msix_init(&vdev->pdev, vdev->msix->entries,
> @@ -2033,14 +2043,14 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
>   
>       switch (cap_id) {
>       case PCI_CAP_ID_MSI:
> -        ret = vfio_msi_setup(vdev, pos, errp);
> +        ret = vfio_msi_setup(vdev, pos, size, errp);
>           break;
>       case PCI_CAP_ID_EXP:
>           vfio_check_pcie_flr(vdev, pos);
>           ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
>           break;
>       case PCI_CAP_ID_MSIX:
> -        ret = vfio_msix_setup(vdev, pos, errp);
> +        ret = vfio_msix_setup(vdev, pos, size, errp);
>           break;
>       case PCI_CAP_ID_PM:
>           vfio_check_pm_reset(vdev, pos);
>
Alex Williamson Oct. 28, 2022, 7:23 p.m. UTC | #3
On Sat, 29 Oct 2022 01:12:11 +0900
Akihiko Odaki <akihiko.odaki@daynix.com> wrote:

> On 2022/10/28 23:16, Alex Williamson wrote:
> > On Fri, 28 Oct 2022 21:26:13 +0900
> > Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >   
> >> vfio_add_std_cap() is designed to ensure that capabilities do not
> >> overlap, but it failed to do so for MSI and MSI-X capabilities.
> >>
> >> Ensure MSI and MSI-X capabilities do not overlap with others by omitting
> >> other overlapping capabilities.
> >>
> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> >> ---
> >>   hw/vfio/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------
> >>   hw/vfio/pci.h |  3 +++
> >>   2 files changed, 56 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 939dcc3d4a..36c8f3dc85 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -1278,23 +1278,42 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
> >>       }
> >>   }
> >>   
> >> -static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> >> +static void vfio_msi_early_setup(VFIOPCIDevice *vdev, Error **errp)
> >>   {
> >>       uint16_t ctrl;
> >> -    bool msi_64bit, msi_maskbit;
> >> -    int ret, entries;
> >> -    Error *err = NULL;
> >> +    uint8_t pos;
> >> +
> >> +    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSI);
> >> +    if (!pos) {
> >> +        return;
> >> +    }
> >>   
> >>       if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
> >>                 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> >>           error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
> >> -        return -errno;
> >> +        return;
> >>       }
> >> -    ctrl = le16_to_cpu(ctrl);
> >> +    vdev->msi_pos = pos;
> >> +    vdev->msi_ctrl = le16_to_cpu(ctrl);
> >>   
> >> -    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
> >> -    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
> >> -    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> >> +    vdev->msi_cap_size = 0xa;
> >> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT)) {
> >> +        vdev->msi_cap_size += 0xa;
> >> +    }
> >> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT)) {
> >> +        vdev->msi_cap_size += 0x4;
> >> +    }
> >> +}
> >> +
> >> +static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> >> +{
> >> +    bool msi_64bit, msi_maskbit;
> >> +    int ret, entries;
> >> +    Error *err = NULL;
> >> +
> >> +    msi_64bit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT);
> >> +    msi_maskbit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT);
> >> +    entries = 1 << ((vdev->msi_ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> >>   
> >>       trace_vfio_msi_setup(vdev->vbasedev.name, pos);
> >>   
> >> @@ -1306,7 +1325,6 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> >>           error_propagate_prepend(errp, err, "msi_init failed: ");
> >>           return ret;
> >>       }
> >> -    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
> >>   
> >>       return 0;
> >>   }
> >> @@ -1524,6 +1542,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> >>       pba = le32_to_cpu(pba);
> >>   
> >>       msix = g_malloc0(sizeof(*msix));
> >> +    msix->pos = pos;
> >>       msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
> >>       msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> >>       msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
> >> @@ -2025,6 +2044,24 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
> >>           }
> >>       }
> >>   
> >> +    if (cap_id != PCI_CAP_ID_MSI &&
> >> +        range_covers_byte(vdev->msi_pos, vdev->msi_cap_size, pos)) {
> >> +        warn_report(VFIO_MSG_PREFIX
> >> +                    "A capability overlaps with MSI, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
> >> +                    vdev->vbasedev.name, cap_id, pos,
> >> +                    vdev->msi_pos, vdev->msi_pos + vdev->msi_cap_size);
> >> +        return 0;
> >> +    }
> >> +
> >> +    if (cap_id != PCI_CAP_ID_MSIX && vdev->msix &&
> >> +        range_covers_byte(vdev->msix->pos, MSIX_CAP_LENGTH, pos)) {
> >> +        warn_report(VFIO_MSG_PREFIX
> >> +                    "A capability overlaps with MSI-X, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
> >> +                    vdev->vbasedev.name, cap_id, pos,
> >> +                    vdev->msix->pos, vdev->msix->pos + MSIX_CAP_LENGTH);
> >> +        return 0;
> >> +    }  
> > 
> > Capabilities are not a single byte, the fact that it doesn't start
> > within the MSI or MSI-X capability is not a sufficient test.  We're
> > also choosing to prioritize MSI and MSI-X capabilities by protecting
> > that range rather than the existing behavior where we'd drop those
> > capabilities if they overlap with another capability that has already
> > been placed.  There are merits to both approaches, but I don't see any
> > justification here to change the current behavior.
> > 
> > Isn't the most similar behavior to existing to pass the available size
> > to vfio_msi[x]_setup() and return an errno if the size would be
> > exceeded?  Something like below (untested, and requires exporting
> > msi_cap_sizeof()).  Thanks,  
> 
> It only tests the beginning of the capability currently being added 
> because its end is determined by vfio_std_cap_max_size() so that the 
> overlap does not happen.
> 
> A comment in vfio_add_std_cap() says:
>  >     /*
>  >      * If it becomes important to configure capabilities to their actual
>  >      * size, use this as the default when it's something we don't   
> recognize.
>  >      * Since QEMU doesn't actually handle many of the config accesses,
>  >      * exact size doesn't seem worthwhile.
>  >      */  
> 
> My understanding of the problem is that while clipping is performed when 
> overlapping two capabilities other than MSI and MSI-X according to the 
> comment, the clipping does not happen when one of the overlapping 
> capability is MSI or MSI-X.
> 
> According to that, the correct way to fix is to perform clipping also in 
> such a case. As QEMU actually handles the config acccesses for MSI and 
> MSI-X, MSI and MSI-X are always priotized over the other capabilities.

Here's a scenario, a vendor ships a device with an MSI capability where
the MSI control register reports per vector masking, but the packing of
the capabilities is such that the next capability doesn't allow for the
mask and pending bits registers.  Currently, depending on the order we
add them, pci_add_capability() will fail for either the MSI capability
or the encroaching capability.  This failure will propagate back to
vfio_realize and we'll fail to instantiate the device.  To make this
scenario even a bit more pathological, let's assume the encroaching
capability is MSI-X.

As proposed here, we'd drop the MSI-X capability because it's starting
position lies within our expectation of the extent of the MSI
capability, and we'd allow the device to initialize with only MSI.
Was that intentional?  Was that a good choice?  What if the driver
only supports MSI-X?  We've subtly, perhaps unintentionally, changed
the policy based on some notion of prioritizing certain capabilities
over others.

The intent of vfio_std_cap_max_size() is not to intentionally
clip ranges, it's only meant to simplify defining the extent of a
capability to be bounded by the nearest capability after it in config
space.

Currently we rely on a combination of our own range management and the
overlap detection in pci_add_capability() to generate a device
instantiation failure.  If it's deemed worthwhile to remove the latter,
and that is the extent of the focus of this series, let's not go
dabbling into defining new priority schemes for capabilities and
defining certain overlap scenarios as arbitrarily continue'able.
Thanks,

Alex
Akihiko Odaki Oct. 31, 2022, 12:37 p.m. UTC | #4
On 2022/10/29 4:23, Alex Williamson wrote:
> On Sat, 29 Oct 2022 01:12:11 +0900
> Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> 
>> On 2022/10/28 23:16, Alex Williamson wrote:
>>> On Fri, 28 Oct 2022 21:26:13 +0900
>>> Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>    
>>>> vfio_add_std_cap() is designed to ensure that capabilities do not
>>>> overlap, but it failed to do so for MSI and MSI-X capabilities.
>>>>
>>>> Ensure MSI and MSI-X capabilities do not overlap with others by omitting
>>>> other overlapping capabilities.
>>>>
>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>>>> ---
>>>>    hw/vfio/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------
>>>>    hw/vfio/pci.h |  3 +++
>>>>    2 files changed, 56 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 939dcc3d4a..36c8f3dc85 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -1278,23 +1278,42 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
>>>>        }
>>>>    }
>>>>    
>>>> -static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>>>> +static void vfio_msi_early_setup(VFIOPCIDevice *vdev, Error **errp)
>>>>    {
>>>>        uint16_t ctrl;
>>>> -    bool msi_64bit, msi_maskbit;
>>>> -    int ret, entries;
>>>> -    Error *err = NULL;
>>>> +    uint8_t pos;
>>>> +
>>>> +    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSI);
>>>> +    if (!pos) {
>>>> +        return;
>>>> +    }
>>>>    
>>>>        if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
>>>>                  vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
>>>>            error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
>>>> -        return -errno;
>>>> +        return;
>>>>        }
>>>> -    ctrl = le16_to_cpu(ctrl);
>>>> +    vdev->msi_pos = pos;
>>>> +    vdev->msi_ctrl = le16_to_cpu(ctrl);
>>>>    
>>>> -    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
>>>> -    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
>>>> -    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
>>>> +    vdev->msi_cap_size = 0xa;
>>>> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT)) {
>>>> +        vdev->msi_cap_size += 0xa;
>>>> +    }
>>>> +    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT)) {
>>>> +        vdev->msi_cap_size += 0x4;
>>>> +    }
>>>> +}
>>>> +
>>>> +static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>>>> +{
>>>> +    bool msi_64bit, msi_maskbit;
>>>> +    int ret, entries;
>>>> +    Error *err = NULL;
>>>> +
>>>> +    msi_64bit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT);
>>>> +    msi_maskbit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT);
>>>> +    entries = 1 << ((vdev->msi_ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
>>>>    
>>>>        trace_vfio_msi_setup(vdev->vbasedev.name, pos);
>>>>    
>>>> @@ -1306,7 +1325,6 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>>>>            error_propagate_prepend(errp, err, "msi_init failed: ");
>>>>            return ret;
>>>>        }
>>>> -    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
>>>>    
>>>>        return 0;
>>>>    }
>>>> @@ -1524,6 +1542,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>>>>        pba = le32_to_cpu(pba);
>>>>    
>>>>        msix = g_malloc0(sizeof(*msix));
>>>> +    msix->pos = pos;
>>>>        msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
>>>>        msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
>>>>        msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
>>>> @@ -2025,6 +2044,24 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
>>>>            }
>>>>        }
>>>>    
>>>> +    if (cap_id != PCI_CAP_ID_MSI &&
>>>> +        range_covers_byte(vdev->msi_pos, vdev->msi_cap_size, pos)) {
>>>> +        warn_report(VFIO_MSG_PREFIX
>>>> +                    "A capability overlaps with MSI, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
>>>> +                    vdev->vbasedev.name, cap_id, pos,
>>>> +                    vdev->msi_pos, vdev->msi_pos + vdev->msi_cap_size);
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    if (cap_id != PCI_CAP_ID_MSIX && vdev->msix &&
>>>> +        range_covers_byte(vdev->msix->pos, MSIX_CAP_LENGTH, pos)) {
>>>> +        warn_report(VFIO_MSG_PREFIX
>>>> +                    "A capability overlaps with MSI-X, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
>>>> +                    vdev->vbasedev.name, cap_id, pos,
>>>> +                    vdev->msix->pos, vdev->msix->pos + MSIX_CAP_LENGTH);
>>>> +        return 0;
>>>> +    }
>>>
>>> Capabilities are not a single byte, the fact that it doesn't start
>>> within the MSI or MSI-X capability is not a sufficient test.  We're
>>> also choosing to prioritize MSI and MSI-X capabilities by protecting
>>> that range rather than the existing behavior where we'd drop those
>>> capabilities if they overlap with another capability that has already
>>> been placed.  There are merits to both approaches, but I don't see any
>>> justification here to change the current behavior.
>>>
>>> Isn't the most similar behavior to existing to pass the available size
>>> to vfio_msi[x]_setup() and return an errno if the size would be
>>> exceeded?  Something like below (untested, and requires exporting
>>> msi_cap_sizeof()).  Thanks,
>>
>> It only tests the beginning of the capability currently being added
>> because its end is determined by vfio_std_cap_max_size() so that the
>> overlap does not happen.
>>
>> A comment in vfio_add_std_cap() says:
>>   >     /*
>>   >      * If it becomes important to configure capabilities to their actual
>>   >      * size, use this as the default when it's something we don't
>> recognize.
>>   >      * Since QEMU doesn't actually handle many of the config accesses,
>>   >      * exact size doesn't seem worthwhile.
>>   >      */
>>
>> My understanding of the problem is that while clipping is performed when
>> overlapping two capabilities other than MSI and MSI-X according to the
>> comment, the clipping does not happen when one of the overlapping
>> capability is MSI or MSI-X.
>>
>> According to that, the correct way to fix is to perform clipping also in
>> such a case. As QEMU actually handles the config acccesses for MSI and
>> MSI-X, MSI and MSI-X are always priotized over the other capabilities.
> 
> Here's a scenario, a vendor ships a device with an MSI capability where
> the MSI control register reports per vector masking, but the packing of
> the capabilities is such that the next capability doesn't allow for the
> mask and pending bits registers.  Currently, depending on the order we
> add them, pci_add_capability() will fail for either the MSI capability
> or the encroaching capability.  This failure will propagate back to
> vfio_realize and we'll fail to instantiate the device.  To make this
> scenario even a bit more pathological, let's assume the encroaching
> capability is MSI-X.
> 
> As proposed here, we'd drop the MSI-X capability because it's starting
> position lies within our expectation of the extent of the MSI
> capability, and we'd allow the device to initialize with only MSI.
> Was that intentional?  Was that a good choice?  What if the driver
> only supports MSI-X?  We've subtly, perhaps unintentionally, changed
> the policy based on some notion of prioritizing certain capabilities
> over others.
> 
> The intent of vfio_std_cap_max_size() is not to intentionally
> clip ranges, it's only meant to simplify defining the extent of a
> capability to be bounded by the nearest capability after it in config
> space.
> 
> Currently we rely on a combination of our own range management and the
> overlap detection in pci_add_capability() to generate a device
> instantiation failure.  If it's deemed worthwhile to remove the latter,
> and that is the extent of the focus of this series, let's not go
> dabbling into defining new priority schemes for capabilities and
> defining certain overlap scenarios as arbitrarily continue'able.
> Thanks,
> 
> Alex
> 

You are right. I missed the part that vfio_std_cap_max_size() is not 
intended to clip ranges. That invalidates reasoning to continue when 
MSI/MSI-X capability overlaps with another.

I have sent v6 to make the cases error. Thanks for reviewing and 
pointing this out.

Regards,
Akihiko Odaki
diff mbox series

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939dcc3d4a..36c8f3dc85 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1278,23 +1278,42 @@  static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
     }
 }
 
-static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
+static void vfio_msi_early_setup(VFIOPCIDevice *vdev, Error **errp)
 {
     uint16_t ctrl;
-    bool msi_64bit, msi_maskbit;
-    int ret, entries;
-    Error *err = NULL;
+    uint8_t pos;
+
+    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSI);
+    if (!pos) {
+        return;
+    }
 
     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
         error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
-        return -errno;
+        return;
     }
-    ctrl = le16_to_cpu(ctrl);
+    vdev->msi_pos = pos;
+    vdev->msi_ctrl = le16_to_cpu(ctrl);
 
-    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
-    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
-    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+    vdev->msi_cap_size = 0xa;
+    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT)) {
+        vdev->msi_cap_size += 0xa;
+    }
+    if ((vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT)) {
+        vdev->msi_cap_size += 0x4;
+    }
+}
+
+static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
+{
+    bool msi_64bit, msi_maskbit;
+    int ret, entries;
+    Error *err = NULL;
+
+    msi_64bit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_64BIT);
+    msi_maskbit = !!(vdev->msi_ctrl & PCI_MSI_FLAGS_MASKBIT);
+    entries = 1 << ((vdev->msi_ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
 
     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
 
@@ -1306,7 +1325,6 @@  static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
         error_propagate_prepend(errp, err, "msi_init failed: ");
         return ret;
     }
-    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
 
     return 0;
 }
@@ -1524,6 +1542,7 @@  static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
     pba = le32_to_cpu(pba);
 
     msix = g_malloc0(sizeof(*msix));
+    msix->pos = pos;
     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
@@ -2025,6 +2044,24 @@  static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
         }
     }
 
+    if (cap_id != PCI_CAP_ID_MSI &&
+        range_covers_byte(vdev->msi_pos, vdev->msi_cap_size, pos)) {
+        warn_report(VFIO_MSG_PREFIX
+                    "A capability overlaps with MSI, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
+                    vdev->vbasedev.name, cap_id, pos,
+                    vdev->msi_pos, vdev->msi_pos + vdev->msi_cap_size);
+        return 0;
+    }
+
+    if (cap_id != PCI_CAP_ID_MSIX && vdev->msix &&
+        range_covers_byte(vdev->msix->pos, MSIX_CAP_LENGTH, pos)) {
+        warn_report(VFIO_MSG_PREFIX
+                    "A capability overlaps with MSI-X, ignoring (%" PRIu8 " @ %" PRIu8 " in [%" PRIu8 ", %" PRIu8 "))",
+                    vdev->vbasedev.name, cap_id, pos,
+                    vdev->msix->pos, vdev->msix->pos + MSIX_CAP_LENGTH);
+        return 0;
+    }
+
     /* Scale down size, esp in case virt caps were added above */
     size = MIN(size, vfio_std_cap_max_size(pdev, pos));
 
@@ -3037,6 +3074,12 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
 
     vfio_bars_prepare(vdev);
 
+    vfio_msi_early_setup(vdev, &err);
+    if (err) {
+        error_propagate(errp, err);
+        goto error;
+    }
+
     vfio_msix_early_setup(vdev, &err);
     if (err) {
         error_propagate(errp, err);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 7c236a52f4..9ae0278058 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -107,6 +107,7 @@  enum {
 
 /* Cache of MSI-X setup */
 typedef struct VFIOMSIXInfo {
+    uint8_t pos;
     uint8_t table_bar;
     uint8_t pba_bar;
     uint16_t entries;
@@ -128,6 +129,8 @@  struct VFIOPCIDevice {
     unsigned int rom_size;
     off_t rom_offset; /* Offset of ROM region within device fd */
     void *rom;
+    uint8_t msi_pos;
+    uint16_t msi_ctrl;
     int msi_cap_size;
     VFIOMSIVector *msi_vectors;
     VFIOMSIXInfo *msix;