diff mbox series

[for,7.2?] vhost: fix vq dirt bitmap syncing when vIOMMU is enabled

Message ID 20221122030111.4230-1-jasowang@redhat.com (mailing list archive)
State New, archived
Headers show
Series [for,7.2?] vhost: fix vq dirt bitmap syncing when vIOMMU is enabled | expand

Commit Message

Jason Wang Nov. 22, 2022, 3:01 a.m. UTC
When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
GPA. So we need to translate it to GPA before the syncing otherwise we
may hit the following crash since IOVA could be out of the scope of
the GPA log size. This could be noted when using virtio-IOMMU with
vhost using 1G memory.

Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
Cc: qemu-stable@nongnu.org
Reported-by: Yalan Zhang <yalzhang@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 20 deletions(-)

Comments

Michael S. Tsirkin Nov. 22, 2022, 9:43 a.m. UTC | #1
On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> GPA. So we need to translate it to GPA before the syncing otherwise we
> may hit the following crash since IOVA could be out of the scope of
> the GPA log size. This could be noted when using virtio-IOMMU with
> vhost using 1G memory.

Noted how exactly? What does "using 1G memory" mean?

> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> Cc: qemu-stable@nongnu.org
> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> Tested-by: Eric Auger <eric.auger@redhat.com>
> Tested-by: Lei Yang <leiyang@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
>  1 file changed, 45 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index d1c4c20b8c..26b319f34e 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
>      }
>  }
>  
> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> +{
> +    VirtIODevice *vdev = dev->vdev;
> +
> +    /*
> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> +     * incremental memory mapping API via IOTLB API. For platform that
> +     * does not have IOMMU, there's no need to enable this feature
> +     * which may cause unnecessary IOTLB miss/update transactions.
> +     */
> +    if (vdev) {
> +        return virtio_bus_device_iommu_enabled(vdev) &&
> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> +    } else {
> +        return false;
> +    }
> +}
> +
>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>                                     MemoryRegionSection *section,
>                                     hwaddr first,
>                                     hwaddr last)
>  {
> +    IOMMUTLBEntry iotlb;

why don't we move this inside the scope where it's used?

>      int i;
>      hwaddr start_addr;
>      hwaddr end_addr;
> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>      }
>      for (i = 0; i < dev->nvqs; ++i) {
>          struct vhost_virtqueue *vq = dev->vqs + i;
> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> +        hwaddr phys, s;

these two, too.

>  
>          if (!vq->used_phys && !vq->used_size) {
>              continue;
>          }
>  
> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> -                              range_get_last(vq->used_phys, vq->used_size));
> +        if (vhost_dev_has_iommu(dev)) {
> +            while (used_size) {
> +                rcu_read_lock();
> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> +                                                      used_phys,
> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> +                rcu_read_unlock();
> +
> +                if (iotlb.target_as == NULL) {
> +                    return -EINVAL;

I am not sure how this can trigger. I don't like == NULL:
!iotlb.target_as is more succint. But a bigger question is how to
handle this. callers ignore the return value so maybe
log guest error? iommu seems misconfigured ...


> +                }
> +
> +                phys = iotlb.translated_addr;
> +                s = MIN(iotlb.addr_mask + 1, used_size);

Note, that iotlb.translated_addr here is an aligned address and
iotlb.addr_mask + 1 is size from there. 

So I think phys that you want is actually
	phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);



accordingly, the size would be from there until end of mask:
	s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);


Also, it bothers me that the math here will give you 0 if addr_mask is
all ones. Then MIN will give 0 too and we loop forever.  I think this
can not trigger, but I'd rather we play it safe and add outside of MIN
after it's capped to a reasonable value. So we end up with:

	/* Distance from start of used ring until last byte of IOMMU page */
	s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
	/* size of used ring, or of the part of it until end of IOMMU page */
	s = MIN(s, used_size - 1) + 1;





> +
> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> +                                      range_get_last(phys, used_size));

why are you syncing used_size here? Shouldn't it be s?



> +                used_size -= s;
> +                used_phys += s;
> +            }
> +        } else {
> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> +                                  range_get_last(used_phys, used_size));
> +        }
>      }
>      return 0;
>  }
> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
>      dev->log_size = size;
>  }
>  
> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> -{
> -    VirtIODevice *vdev = dev->vdev;
> -
> -    /*
> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> -     * incremental memory mapping API via IOTLB API. For platform that
> -     * does not have IOMMU, there's no need to enable this feature
> -     * which may cause unnecessary IOTLB miss/update transactions.
> -     */
> -    if (vdev) {
> -        return virtio_bus_device_iommu_enabled(vdev) &&
> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> -    } else {
> -        return false;
> -    }
> -}
> -
>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
>                                hwaddr *plen, bool is_write)
>  {
> -- 
> 2.25.1
Eric Auger Nov. 22, 2022, 4:28 p.m. UTC | #2
Hi,

On 11/22/22 10:43, Michael S. Tsirkin wrote:
> On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
>> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
>> GPA. So we need to translate it to GPA before the syncing otherwise we
>> may hit the following crash since IOVA could be out of the scope of
>> the GPA log size. This could be noted when using virtio-IOMMU with
>> vhost using 1G memory.
> Noted how exactly? What does "using 1G memory" mean?

We hit the following assertion:

qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.

On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000

In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:

qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.



"using 1G memory": We hit the issue with a guest started with 1GB initial RAM.


>
>> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
>> Cc: qemu-stable@nongnu.org
>> Reported-by: Yalan Zhang <yalzhang@redhat.com>
>> Tested-by: Eric Auger <eric.auger@redhat.com>
>> Tested-by: Lei Yang <leiyang@redhat.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
>>  1 file changed, 45 insertions(+), 20 deletions(-)
>>
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index d1c4c20b8c..26b319f34e 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
>>      }
>>  }
>>  
>> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
>> +{
>> +    VirtIODevice *vdev = dev->vdev;
>> +
>> +    /*
>> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
>> +     * incremental memory mapping API via IOTLB API. For platform that
>> +     * does not have IOMMU, there's no need to enable this feature
>> +     * which may cause unnecessary IOTLB miss/update transactions.
>> +     */
>> +    if (vdev) {
>> +        return virtio_bus_device_iommu_enabled(vdev) &&
>> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
>> +    } else {
>> +        return false;
>> +    }
>> +}
>> +
>>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>>                                     MemoryRegionSection *section,
>>                                     hwaddr first,
>>                                     hwaddr last)
>>  {
>> +    IOMMUTLBEntry iotlb;
> why don't we move this inside the scope where it's used?
>
>>      int i;
>>      hwaddr start_addr;
>>      hwaddr end_addr;
>> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>>      }
>>      for (i = 0; i < dev->nvqs; ++i) {
>>          struct vhost_virtqueue *vq = dev->vqs + i;
>> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
>> +        hwaddr phys, s;
> these two, too.
>
>>  
>>          if (!vq->used_phys && !vq->used_size) {
>>              continue;
>>          }
>>  
>> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
>> -                              range_get_last(vq->used_phys, vq->used_size));
>> +        if (vhost_dev_has_iommu(dev)) {
>> +            while (used_size) {
>> +                rcu_read_lock();
>> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
>> +                                                      used_phys,
>> +                                                      true, MEMTXATTRS_UNSPECIFIED);
>> +                rcu_read_unlock();
>> +
>> +                if (iotlb.target_as == NULL) {
>> +                    return -EINVAL;
> I am not sure how this can trigger. I don't like == NULL:
> !iotlb.target_as is more succint. But a bigger question is how to
> handle this. callers ignore the return value so maybe
> log guest error? iommu seems misconfigured ...
>
>
>> +                }
>> +
>> +                phys = iotlb.translated_addr;
>> +                s = MIN(iotlb.addr_mask + 1, used_size);
> Note, that iotlb.translated_addr here is an aligned address and
> iotlb.addr_mask + 1 is size from there. 
>
> So I think phys that you want is actually
> 	phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
>
>
>
> accordingly, the size would be from there until end of mask:
> 	s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
>
>
> Also, it bothers me that the math here will give you 0 if addr_mask is
> all ones. Then MIN will give 0 too and we loop forever.  I think this
> can not trigger, but I'd rather we play it safe and add outside of MIN
> after it's capped to a reasonable value. So we end up with:
>
> 	/* Distance from start of used ring until last byte of IOMMU page */
> 	s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> 	/* size of used ring, or of the part of it until end of IOMMU page */
> 	s = MIN(s, used_size - 1) + 1;
>
>
>
>
>
>> +
>> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
>> +                                      range_get_last(phys, used_size));
> why are you syncing used_size here? Shouldn't it be s?
>
>
>
>> +                used_size -= s;
>> +                used_phys += s;
>> +            }
>> +        } else {
>> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
>> +                                  range_get_last(used_phys, used_size));
>> +        }
>>      }
>>      return 0;
>>  }
>> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
>>      dev->log_size = size;
>>  }
>>  
>> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
>> -{
>> -    VirtIODevice *vdev = dev->vdev;
>> -
>> -    /*
>> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
>> -     * incremental memory mapping API via IOTLB API. For platform that
>> -     * does not have IOMMU, there's no need to enable this feature
>> -     * which may cause unnecessary IOTLB miss/update transactions.
>> -     */
>> -    if (vdev) {
>> -        return virtio_bus_device_iommu_enabled(vdev) &&
>> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
>> -    } else {
>> -        return false;
>> -    }
>> -}
>> -
>>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
>>                                hwaddr *plen, bool is_write)
>>  {
>> -- 
>> 2.25.1

Thanks

Eric
Jason Wang Nov. 23, 2022, 5:26 a.m. UTC | #3
On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
>
> Hi,
>
> On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> >> GPA. So we need to translate it to GPA before the syncing otherwise we
> >> may hit the following crash since IOVA could be out of the scope of
> >> the GPA log size. This could be noted when using virtio-IOMMU with
> >> vhost using 1G memory.
> > Noted how exactly? What does "using 1G memory" mean?
>
> We hit the following assertion:
>
> qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
>
> On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
>
> In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
>
> qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
>
>
>
> "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.

Yes, so in the case the guest iova allocator may try to use an IOVA
that is beyond 1G.

>
>
> >
> >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> >> Cc: qemu-stable@nongnu.org
> >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> >> Tested-by: Eric Auger <eric.auger@redhat.com>
> >> Tested-by: Lei Yang <leiyang@redhat.com>
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >> ---
> >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> >>  1 file changed, 45 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >> index d1c4c20b8c..26b319f34e 100644
> >> --- a/hw/virtio/vhost.c
> >> +++ b/hw/virtio/vhost.c
> >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> >>      }
> >>  }
> >>
> >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >> +{
> >> +    VirtIODevice *vdev = dev->vdev;
> >> +
> >> +    /*
> >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> >> +     * incremental memory mapping API via IOTLB API. For platform that
> >> +     * does not have IOMMU, there's no need to enable this feature
> >> +     * which may cause unnecessary IOTLB miss/update transactions.
> >> +     */
> >> +    if (vdev) {
> >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> >> +    } else {
> >> +        return false;
> >> +    }
> >> +}
> >> +
> >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> >>                                     MemoryRegionSection *section,
> >>                                     hwaddr first,
> >>                                     hwaddr last)
> >>  {
> >> +    IOMMUTLBEntry iotlb;
> > why don't we move this inside the scope where it's used?

That's fine.

> >
> >>      int i;
> >>      hwaddr start_addr;
> >>      hwaddr end_addr;
> >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> >>      }
> >>      for (i = 0; i < dev->nvqs; ++i) {
> >>          struct vhost_virtqueue *vq = dev->vqs + i;
> >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> >> +        hwaddr phys, s;
> > these two, too.

Right.

> >
> >>
> >>          if (!vq->used_phys && !vq->used_size) {
> >>              continue;
> >>          }
> >>
> >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> >> -                              range_get_last(vq->used_phys, vq->used_size));
> >> +        if (vhost_dev_has_iommu(dev)) {
> >> +            while (used_size) {
> >> +                rcu_read_lock();
> >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> >> +                                                      used_phys,
> >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> >> +                rcu_read_unlock();
> >> +
> >> +                if (iotlb.target_as == NULL) {
> >> +                    return -EINVAL;
> > I am not sure how this can trigger. I don't like == NULL:
> > !iotlb.target_as is more succint. But a bigger question is how to
> > handle this. callers ignore the return value so maybe
> > log guest error? iommu seems misconfigured ...

Ok.

> >
> >
> >> +                }
> >> +
> >> +                phys = iotlb.translated_addr;
> >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > Note, that iotlb.translated_addr here is an aligned address and
> > iotlb.addr_mask + 1 is size from there.
> >
> > So I think phys that you want is actually
> >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> >
> >
> >
> > accordingly, the size would be from there until end of mask:
> >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> >
> >
> > Also, it bothers me that the math here will give you 0 if addr_mask is
> > all ones. Then MIN will give 0 too and we loop forever.  I think this
> > can not trigger, but I'd rather we play it safe and add outside of MIN
> > after it's capped to a reasonable value. So we end up with:
> >
> >       /* Distance from start of used ring until last byte of IOMMU page */
> >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> >       /* size of used ring, or of the part of it until end of IOMMU page */
> >       s = MIN(s, used_size - 1) + 1;
> >
> >

Right.

> >
> >
> >
> >> +
> >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> >> +                                      range_get_last(phys, used_size));
> > why are you syncing used_size here? Shouldn't it be s?

Let me fix this.

Thanks

> >
> >
> >
> >> +                used_size -= s;
> >> +                used_phys += s;
> >> +            }
> >> +        } else {
> >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> >> +                                  range_get_last(used_phys, used_size));
> >> +        }
> >>      }
> >>      return 0;
> >>  }
> >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> >>      dev->log_size = size;
> >>  }
> >>
> >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >> -{
> >> -    VirtIODevice *vdev = dev->vdev;
> >> -
> >> -    /*
> >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> >> -     * incremental memory mapping API via IOTLB API. For platform that
> >> -     * does not have IOMMU, there's no need to enable this feature
> >> -     * which may cause unnecessary IOTLB miss/update transactions.
> >> -     */
> >> -    if (vdev) {
> >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> >> -    } else {
> >> -        return false;
> >> -    }
> >> -}
> >> -
> >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> >>                                hwaddr *plen, bool is_write)
> >>  {
> >> --
> >> 2.25.1
>
> Thanks
>
> Eric
>
Jason Wang Nov. 23, 2022, 5:47 a.m. UTC | #4
On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> >
> > Hi,
> >
> > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > >> may hit the following crash since IOVA could be out of the scope of
> > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > >> vhost using 1G memory.
> > > Noted how exactly? What does "using 1G memory" mean?
> >
> > We hit the following assertion:
> >
> > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> >
> > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> >
> > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> >
> > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> >
> >
> >
> > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
>
> Yes, so in the case the guest iova allocator may try to use an IOVA
> that is beyond 1G.
>
> >
> >
> > >
> > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > >> Cc: qemu-stable@nongnu.org
> > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >> ---
> > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > >>
> > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > >> index d1c4c20b8c..26b319f34e 100644
> > >> --- a/hw/virtio/vhost.c
> > >> +++ b/hw/virtio/vhost.c
> > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > >>      }
> > >>  }
> > >>
> > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > >> +{
> > >> +    VirtIODevice *vdev = dev->vdev;
> > >> +
> > >> +    /*
> > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > >> +     * does not have IOMMU, there's no need to enable this feature
> > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > >> +     */
> > >> +    if (vdev) {
> > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > >> +    } else {
> > >> +        return false;
> > >> +    }
> > >> +}
> > >> +
> > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > >>                                     MemoryRegionSection *section,
> > >>                                     hwaddr first,
> > >>                                     hwaddr last)
> > >>  {
> > >> +    IOMMUTLBEntry iotlb;
> > > why don't we move this inside the scope where it's used?
>
> That's fine.
>
> > >
> > >>      int i;
> > >>      hwaddr start_addr;
> > >>      hwaddr end_addr;
> > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > >>      }
> > >>      for (i = 0; i < dev->nvqs; ++i) {
> > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > >> +        hwaddr phys, s;
> > > these two, too.
>
> Right.
>
> > >
> > >>
> > >>          if (!vq->used_phys && !vq->used_size) {
> > >>              continue;
> > >>          }
> > >>
> > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > >> +        if (vhost_dev_has_iommu(dev)) {
> > >> +            while (used_size) {
> > >> +                rcu_read_lock();
> > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > >> +                                                      used_phys,
> > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > >> +                rcu_read_unlock();
> > >> +
> > >> +                if (iotlb.target_as == NULL) {
> > >> +                    return -EINVAL;
> > > I am not sure how this can trigger. I don't like == NULL:
> > > !iotlb.target_as is more succint. But a bigger question is how to
> > > handle this. callers ignore the return value so maybe
> > > log guest error? iommu seems misconfigured ...
>
> Ok.
>
> > >
> > >
> > >> +                }
> > >> +
> > >> +                phys = iotlb.translated_addr;
> > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > Note, that iotlb.translated_addr here is an aligned address and
> > > iotlb.addr_mask + 1 is size from there.
> > >
> > > So I think phys that you want is actually
> > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > >
> > >
> > >
> > > accordingly, the size would be from there until end of mask:
> > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > >
> > >
> > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > all ones.

So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
which should be fine. The used_size has been validated before to be
non-zero.

Thanks

> Then MIN will give 0 too and we loop forever.  I think this
> > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > after it's capped to a reasonable value. So we end up with:
> > >
> > >       /* Distance from start of used ring until last byte of IOMMU page */
> > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > >       s = MIN(s, used_size - 1) + 1;
> > >
> > >
>
> Right.
>
> > >
> > >
> > >
> > >> +
> > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > >> +                                      range_get_last(phys, used_size));
> > > why are you syncing used_size here? Shouldn't it be s?
>
> Let me fix this.
>
> Thanks
>
> > >
> > >
> > >
> > >> +                used_size -= s;
> > >> +                used_phys += s;
> > >> +            }
> > >> +        } else {
> > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > >> +                                  range_get_last(used_phys, used_size));
> > >> +        }
> > >>      }
> > >>      return 0;
> > >>  }
> > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > >>      dev->log_size = size;
> > >>  }
> > >>
> > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > >> -{
> > >> -    VirtIODevice *vdev = dev->vdev;
> > >> -
> > >> -    /*
> > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > >> -     * does not have IOMMU, there's no need to enable this feature
> > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > >> -     */
> > >> -    if (vdev) {
> > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > >> -    } else {
> > >> -        return false;
> > >> -    }
> > >> -}
> > >> -
> > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > >>                                hwaddr *plen, bool is_write)
> > >>  {
> > >> --
> > >> 2.25.1
> >
> > Thanks
> >
> > Eric
> >
Michael S. Tsirkin Nov. 23, 2022, 6:15 a.m. UTC | #5
On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > >
> > > Hi,
> > >
> > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > >> may hit the following crash since IOVA could be out of the scope of
> > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > >> vhost using 1G memory.
> > > > Noted how exactly? What does "using 1G memory" mean?
> > >
> > > We hit the following assertion:
> > >
> > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > >
> > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > >
> > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > >
> > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > >
> > >
> > >
> > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> >
> > Yes, so in the case the guest iova allocator may try to use an IOVA
> > that is beyond 1G.
> >
> > >
> > >
> > > >
> > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > >> Cc: qemu-stable@nongnu.org
> > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >> ---
> > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > >>
> > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > >> index d1c4c20b8c..26b319f34e 100644
> > > >> --- a/hw/virtio/vhost.c
> > > >> +++ b/hw/virtio/vhost.c
> > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > >>      }
> > > >>  }
> > > >>
> > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > >> +{
> > > >> +    VirtIODevice *vdev = dev->vdev;
> > > >> +
> > > >> +    /*
> > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > >> +     */
> > > >> +    if (vdev) {
> > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > >> +    } else {
> > > >> +        return false;
> > > >> +    }
> > > >> +}
> > > >> +
> > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > >>                                     MemoryRegionSection *section,
> > > >>                                     hwaddr first,
> > > >>                                     hwaddr last)
> > > >>  {
> > > >> +    IOMMUTLBEntry iotlb;
> > > > why don't we move this inside the scope where it's used?
> >
> > That's fine.
> >
> > > >
> > > >>      int i;
> > > >>      hwaddr start_addr;
> > > >>      hwaddr end_addr;
> > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > >>      }
> > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > >> +        hwaddr phys, s;
> > > > these two, too.
> >
> > Right.
> >
> > > >
> > > >>
> > > >>          if (!vq->used_phys && !vq->used_size) {
> > > >>              continue;
> > > >>          }
> > > >>
> > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > >> +            while (used_size) {
> > > >> +                rcu_read_lock();
> > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > >> +                                                      used_phys,
> > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > >> +                rcu_read_unlock();
> > > >> +
> > > >> +                if (iotlb.target_as == NULL) {
> > > >> +                    return -EINVAL;
> > > > I am not sure how this can trigger. I don't like == NULL:
> > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > handle this. callers ignore the return value so maybe
> > > > log guest error? iommu seems misconfigured ...
> >
> > Ok.
> >
> > > >
> > > >
> > > >> +                }
> > > >> +
> > > >> +                phys = iotlb.translated_addr;
> > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > iotlb.addr_mask + 1 is size from there.
> > > >
> > > > So I think phys that you want is actually
> > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > >
> > > >
> > > >
> > > > accordingly, the size would be from there until end of mask:
> > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > >
> > > >
> > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > all ones.
> 
> So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> which should be fine.

How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).

> The used_size has been validated before to be
> non-zero.
> 
> Thanks
> 
> > Then MIN will give 0 too and we loop forever.  I think this
> > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > after it's capped to a reasonable value. So we end up with:
> > > >
> > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > >       s = MIN(s, used_size - 1) + 1;
> > > >
> > > >
> >
> > Right.
> >
> > > >
> > > >
> > > >
> > > >> +
> > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > >> +                                      range_get_last(phys, used_size));
> > > > why are you syncing used_size here? Shouldn't it be s?
> >
> > Let me fix this.
> >
> > Thanks
> >
> > > >
> > > >
> > > >
> > > >> +                used_size -= s;
> > > >> +                used_phys += s;
> > > >> +            }
> > > >> +        } else {
> > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > >> +                                  range_get_last(used_phys, used_size));
> > > >> +        }
> > > >>      }
> > > >>      return 0;
> > > >>  }
> > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > >>      dev->log_size = size;
> > > >>  }
> > > >>
> > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > >> -{
> > > >> -    VirtIODevice *vdev = dev->vdev;
> > > >> -
> > > >> -    /*
> > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > >> -     */
> > > >> -    if (vdev) {
> > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > >> -    } else {
> > > >> -        return false;
> > > >> -    }
> > > >> -}
> > > >> -
> > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > >>                                hwaddr *plen, bool is_write)
> > > >>  {
> > > >> --
> > > >> 2.25.1
> > >
> > > Thanks
> > >
> > > Eric
> > >
Jason Wang Nov. 23, 2022, 7:08 a.m. UTC | #6
On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > >
> > > > Hi,
> > > >
> > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > >> vhost using 1G memory.
> > > > > Noted how exactly? What does "using 1G memory" mean?
> > > >
> > > > We hit the following assertion:
> > > >
> > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > >
> > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > >
> > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > >
> > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > >
> > > >
> > > >
> > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > >
> > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > that is beyond 1G.
> > >
> > > >
> > > >
> > > > >
> > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > >> Cc: qemu-stable@nongnu.org
> > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > >> ---
> > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > >>
> > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > >> --- a/hw/virtio/vhost.c
> > > > >> +++ b/hw/virtio/vhost.c
> > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > >>      }
> > > > >>  }
> > > > >>
> > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > >> +{
> > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > >> +
> > > > >> +    /*
> > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > >> +     */
> > > > >> +    if (vdev) {
> > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > >> +    } else {
> > > > >> +        return false;
> > > > >> +    }
> > > > >> +}
> > > > >> +
> > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > >>                                     MemoryRegionSection *section,
> > > > >>                                     hwaddr first,
> > > > >>                                     hwaddr last)
> > > > >>  {
> > > > >> +    IOMMUTLBEntry iotlb;
> > > > > why don't we move this inside the scope where it's used?
> > >
> > > That's fine.
> > >
> > > > >
> > > > >>      int i;
> > > > >>      hwaddr start_addr;
> > > > >>      hwaddr end_addr;
> > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > >>      }
> > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > >> +        hwaddr phys, s;
> > > > > these two, too.
> > >
> > > Right.
> > >
> > > > >
> > > > >>
> > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > >>              continue;
> > > > >>          }
> > > > >>
> > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > >> +            while (used_size) {
> > > > >> +                rcu_read_lock();
> > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > >> +                                                      used_phys,
> > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > >> +                rcu_read_unlock();
> > > > >> +
> > > > >> +                if (iotlb.target_as == NULL) {
> > > > >> +                    return -EINVAL;
> > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > handle this. callers ignore the return value so maybe
> > > > > log guest error? iommu seems misconfigured ...
> > >
> > > Ok.
> > >
> > > > >
> > > > >
> > > > >> +                }
> > > > >> +
> > > > >> +                phys = iotlb.translated_addr;
> > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > iotlb.addr_mask + 1 is size from there.
> > > > >
> > > > > So I think phys that you want is actually
> > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > >
> > > > >
> > > > >
> > > > > accordingly, the size would be from there until end of mask:
> > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > >
> > > > >
> > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > all ones.
> >
> > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > which should be fine.
>
> How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).

But we have a substration after that. In order to get a zero result,
used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
or anything I missed here?

The 1 is calculated via assuming used_phys are all ones.

Thanks


>
> > The used_size has been validated before to be
> > non-zero.
> >
> > Thanks
> >
> > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > after it's capped to a reasonable value. So we end up with:
> > > > >
> > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > >       s = MIN(s, used_size - 1) + 1;
> > > > >
> > > > >
> > >
> > > Right.
> > >
> > > > >
> > > > >
> > > > >
> > > > >> +
> > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > >> +                                      range_get_last(phys, used_size));
> > > > > why are you syncing used_size here? Shouldn't it be s?
> > >
> > > Let me fix this.
> > >
> > > Thanks
> > >
> > > > >
> > > > >
> > > > >
> > > > >> +                used_size -= s;
> > > > >> +                used_phys += s;
> > > > >> +            }
> > > > >> +        } else {
> > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > >> +                                  range_get_last(used_phys, used_size));
> > > > >> +        }
> > > > >>      }
> > > > >>      return 0;
> > > > >>  }
> > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > >>      dev->log_size = size;
> > > > >>  }
> > > > >>
> > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > >> -{
> > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > >> -
> > > > >> -    /*
> > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > >> -     */
> > > > >> -    if (vdev) {
> > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > >> -    } else {
> > > > >> -        return false;
> > > > >> -    }
> > > > >> -}
> > > > >> -
> > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > >>                                hwaddr *plen, bool is_write)
> > > > >>  {
> > > > >> --
> > > > >> 2.25.1
> > > >
> > > > Thanks
> > > >
> > > > Eric
> > > >
>
Michael S. Tsirkin Nov. 23, 2022, 7:21 a.m. UTC | #7
On Wed, Nov 23, 2022 at 03:08:25PM +0800, Jason Wang wrote:
> On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > > >> vhost using 1G memory.
> > > > > > Noted how exactly? What does "using 1G memory" mean?
> > > > >
> > > > > We hit the following assertion:
> > > > >
> > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > >
> > > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > > >
> > > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > > >
> > > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > >
> > > > >
> > > > >
> > > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > > >
> > > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > > that is beyond 1G.
> > > >
> > > > >
> > > > >
> > > > > >
> > > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > > >> Cc: qemu-stable@nongnu.org
> > > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > >> ---
> > > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > > >>
> > > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > > >> --- a/hw/virtio/vhost.c
> > > > > >> +++ b/hw/virtio/vhost.c
> > > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > > >>      }
> > > > > >>  }
> > > > > >>
> > > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > >> +{
> > > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > > >> +
> > > > > >> +    /*
> > > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > >> +     */
> > > > > >> +    if (vdev) {
> > > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > >> +    } else {
> > > > > >> +        return false;
> > > > > >> +    }
> > > > > >> +}
> > > > > >> +
> > > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > >>                                     MemoryRegionSection *section,
> > > > > >>                                     hwaddr first,
> > > > > >>                                     hwaddr last)
> > > > > >>  {
> > > > > >> +    IOMMUTLBEntry iotlb;
> > > > > > why don't we move this inside the scope where it's used?
> > > >
> > > > That's fine.
> > > >
> > > > > >
> > > > > >>      int i;
> > > > > >>      hwaddr start_addr;
> > > > > >>      hwaddr end_addr;
> > > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > >>      }
> > > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > > >> +        hwaddr phys, s;
> > > > > > these two, too.
> > > >
> > > > Right.
> > > >
> > > > > >
> > > > > >>
> > > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > > >>              continue;
> > > > > >>          }
> > > > > >>
> > > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > > >> +            while (used_size) {
> > > > > >> +                rcu_read_lock();
> > > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > > >> +                                                      used_phys,
> > > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > > >> +                rcu_read_unlock();
> > > > > >> +
> > > > > >> +                if (iotlb.target_as == NULL) {
> > > > > >> +                    return -EINVAL;
> > > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > > handle this. callers ignore the return value so maybe
> > > > > > log guest error? iommu seems misconfigured ...
> > > >
> > > > Ok.
> > > >
> > > > > >
> > > > > >
> > > > > >> +                }
> > > > > >> +
> > > > > >> +                phys = iotlb.translated_addr;
> > > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > > iotlb.addr_mask + 1 is size from there.
> > > > > >
> > > > > > So I think phys that you want is actually
> > > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > > >
> > > > > >
> > > > > >
> > > > > > accordingly, the size would be from there until end of mask:
> > > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > > >
> > > > > >
> > > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > > all ones.
> > >
> > > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > > which should be fine.
> >
> > How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).
> 
> But we have a substration after that.


I don't get it. This is your code:
               s = MIN(iotlb.addr_mask + 1, used_size);

if addr_mask is all ones s becomes 0. Then while loop will loop forever.

> In order to get a zero result,
> used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
> or anything I missed here?
> 
> The 1 is calculated via assuming used_phys are all ones.
> 
> Thanks

Not used_phys, I'm talking about addr_mask being all ones.


> 
> >
> > > The used_size has been validated before to be
> > > non-zero.
> > >
> > > Thanks
> > >
> > > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > > after it's capped to a reasonable value. So we end up with:
> > > > > >
> > > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > > >       s = MIN(s, used_size - 1) + 1;
> > > > > >
> > > > > >
> > > >
> > > > Right.
> > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > >> +
> > > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > > >> +                                      range_get_last(phys, used_size));
> > > > > > why are you syncing used_size here? Shouldn't it be s?
> > > >
> > > > Let me fix this.
> > > >
> > > > Thanks
> > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > >> +                used_size -= s;
> > > > > >> +                used_phys += s;
> > > > > >> +            }
> > > > > >> +        } else {
> > > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > > >> +                                  range_get_last(used_phys, used_size));
> > > > > >> +        }
> > > > > >>      }
> > > > > >>      return 0;
> > > > > >>  }
> > > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > > >>      dev->log_size = size;
> > > > > >>  }
> > > > > >>
> > > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > >> -{
> > > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > > >> -
> > > > > >> -    /*
> > > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > >> -     */
> > > > > >> -    if (vdev) {
> > > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > >> -    } else {
> > > > > >> -        return false;
> > > > > >> -    }
> > > > > >> -}
> > > > > >> -
> > > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > > >>                                hwaddr *plen, bool is_write)
> > > > > >>  {
> > > > > >> --
> > > > > >> 2.25.1
> > > > >
> > > > > Thanks
> > > > >
> > > > > Eric
> > > > >
> >
Jason Wang Nov. 24, 2022, 4:12 a.m. UTC | #8
On Wed, Nov 23, 2022 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Nov 23, 2022 at 03:08:25PM +0800, Jason Wang wrote:
> > On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > > > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > > > >> vhost using 1G memory.
> > > > > > > Noted how exactly? What does "using 1G memory" mean?
> > > > > >
> > > > > > We hit the following assertion:
> > > > > >
> > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > >
> > > > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > > > >
> > > > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > > > >
> > > > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > >
> > > > > >
> > > > > >
> > > > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > > > >
> > > > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > > > that is beyond 1G.
> > > > >
> > > > > >
> > > > > >
> > > > > > >
> > > > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > > > >> Cc: qemu-stable@nongnu.org
> > > > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > >> ---
> > > > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > > > >>
> > > > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > > > >> --- a/hw/virtio/vhost.c
> > > > > > >> +++ b/hw/virtio/vhost.c
> > > > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > > > >>      }
> > > > > > >>  }
> > > > > > >>
> > > > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > >> +{
> > > > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > > > >> +
> > > > > > >> +    /*
> > > > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > >> +     */
> > > > > > >> +    if (vdev) {
> > > > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > >> +    } else {
> > > > > > >> +        return false;
> > > > > > >> +    }
> > > > > > >> +}
> > > > > > >> +
> > > > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > >>                                     MemoryRegionSection *section,
> > > > > > >>                                     hwaddr first,
> > > > > > >>                                     hwaddr last)
> > > > > > >>  {
> > > > > > >> +    IOMMUTLBEntry iotlb;
> > > > > > > why don't we move this inside the scope where it's used?
> > > > >
> > > > > That's fine.
> > > > >
> > > > > > >
> > > > > > >>      int i;
> > > > > > >>      hwaddr start_addr;
> > > > > > >>      hwaddr end_addr;
> > > > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > >>      }
> > > > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > > > >> +        hwaddr phys, s;
> > > > > > > these two, too.
> > > > >
> > > > > Right.
> > > > >
> > > > > > >
> > > > > > >>
> > > > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > > > >>              continue;
> > > > > > >>          }
> > > > > > >>
> > > > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > > > >> +            while (used_size) {
> > > > > > >> +                rcu_read_lock();
> > > > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > > > >> +                                                      used_phys,
> > > > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > > > >> +                rcu_read_unlock();
> > > > > > >> +
> > > > > > >> +                if (iotlb.target_as == NULL) {
> > > > > > >> +                    return -EINVAL;
> > > > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > > > handle this. callers ignore the return value so maybe
> > > > > > > log guest error? iommu seems misconfigured ...
> > > > >
> > > > > Ok.
> > > > >
> > > > > > >
> > > > > > >
> > > > > > >> +                }
> > > > > > >> +
> > > > > > >> +                phys = iotlb.translated_addr;
> > > > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > > > iotlb.addr_mask + 1 is size from there.
> > > > > > >
> > > > > > > So I think phys that you want is actually
> > > > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > accordingly, the size would be from there until end of mask:
> > > > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > > > >
> > > > > > >
> > > > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > > > all ones.
> > > >
> > > > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > > > which should be fine.
> > >
> > > How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).
> >
> > But we have a substration after that.
>
>
> I don't get it. This is your code:
>                s = MIN(iotlb.addr_mask + 1, used_size);
>
> if addr_mask is all ones s becomes 0. Then while loop will loop forever.

Ok, I think there's some misunderstanding here. I meant using:

s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);

Won't result in a zero for s. So there's no need to move the plus one
outside the MIN.

Thanks

>
> > In order to get a zero result,
> > used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
> > or anything I missed here?
> >
> > The 1 is calculated via assuming used_phys are all ones.
> >
> > Thanks
>
> Not used_phys, I'm talking about addr_mask being all ones.
>
>
> >
> > >
> > > > The used_size has been validated before to be
> > > > non-zero.
> > > >
> > > > Thanks
> > > >
> > > > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > > > after it's capped to a reasonable value. So we end up with:
> > > > > > >
> > > > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > > > >       s = MIN(s, used_size - 1) + 1;
> > > > > > >
> > > > > > >
> > > > >
> > > > > Right.
> > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >> +
> > > > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > > > >> +                                      range_get_last(phys, used_size));
> > > > > > > why are you syncing used_size here? Shouldn't it be s?
> > > > >
> > > > > Let me fix this.
> > > > >
> > > > > Thanks
> > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >> +                used_size -= s;
> > > > > > >> +                used_phys += s;
> > > > > > >> +            }
> > > > > > >> +        } else {
> > > > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > > > >> +                                  range_get_last(used_phys, used_size));
> > > > > > >> +        }
> > > > > > >>      }
> > > > > > >>      return 0;
> > > > > > >>  }
> > > > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > > > >>      dev->log_size = size;
> > > > > > >>  }
> > > > > > >>
> > > > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > >> -{
> > > > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > > > >> -
> > > > > > >> -    /*
> > > > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > >> -     */
> > > > > > >> -    if (vdev) {
> > > > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > >> -    } else {
> > > > > > >> -        return false;
> > > > > > >> -    }
> > > > > > >> -}
> > > > > > >> -
> > > > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > > > >>                                hwaddr *plen, bool is_write)
> > > > > > >>  {
> > > > > > >> --
> > > > > > >> 2.25.1
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > Eric
> > > > > >
> > >
>
Michael S. Tsirkin Nov. 24, 2022, 7:06 a.m. UTC | #9
On Thu, Nov 24, 2022 at 12:12:15PM +0800, Jason Wang wrote:
> On Wed, Nov 23, 2022 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Nov 23, 2022 at 03:08:25PM +0800, Jason Wang wrote:
> > > On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > > > > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > > > > >
> > > > > > > Hi,
> > > > > > >
> > > > > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > > > > >> vhost using 1G memory.
> > > > > > > > Noted how exactly? What does "using 1G memory" mean?
> > > > > > >
> > > > > > > We hit the following assertion:
> > > > > > >
> > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > >
> > > > > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > > > > >
> > > > > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > > > > >
> > > > > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > > > > >
> > > > > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > > > > that is beyond 1G.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > > > > >> Cc: qemu-stable@nongnu.org
> > > > > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > >> ---
> > > > > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > > > > >>
> > > > > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > > > > >> --- a/hw/virtio/vhost.c
> > > > > > > >> +++ b/hw/virtio/vhost.c
> > > > > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > > > > >>      }
> > > > > > > >>  }
> > > > > > > >>
> > > > > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > >> +{
> > > > > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > > > > >> +
> > > > > > > >> +    /*
> > > > > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > >> +     */
> > > > > > > >> +    if (vdev) {
> > > > > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > >> +    } else {
> > > > > > > >> +        return false;
> > > > > > > >> +    }
> > > > > > > >> +}
> > > > > > > >> +
> > > > > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > >>                                     MemoryRegionSection *section,
> > > > > > > >>                                     hwaddr first,
> > > > > > > >>                                     hwaddr last)
> > > > > > > >>  {
> > > > > > > >> +    IOMMUTLBEntry iotlb;
> > > > > > > > why don't we move this inside the scope where it's used?
> > > > > >
> > > > > > That's fine.
> > > > > >
> > > > > > > >
> > > > > > > >>      int i;
> > > > > > > >>      hwaddr start_addr;
> > > > > > > >>      hwaddr end_addr;
> > > > > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > >>      }
> > > > > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > > > > >> +        hwaddr phys, s;
> > > > > > > > these two, too.
> > > > > >
> > > > > > Right.
> > > > > >
> > > > > > > >
> > > > > > > >>
> > > > > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > > > > >>              continue;
> > > > > > > >>          }
> > > > > > > >>
> > > > > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > > > > >> +            while (used_size) {
> > > > > > > >> +                rcu_read_lock();
> > > > > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > > > > >> +                                                      used_phys,
> > > > > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > > > > >> +                rcu_read_unlock();
> > > > > > > >> +
> > > > > > > >> +                if (iotlb.target_as == NULL) {
> > > > > > > >> +                    return -EINVAL;
> > > > > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > > > > handle this. callers ignore the return value so maybe
> > > > > > > > log guest error? iommu seems misconfigured ...
> > > > > >
> > > > > > Ok.
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >> +                }
> > > > > > > >> +
> > > > > > > >> +                phys = iotlb.translated_addr;
> > > > > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > > > > iotlb.addr_mask + 1 is size from there.
> > > > > > > >
> > > > > > > > So I think phys that you want is actually
> > > > > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > accordingly, the size would be from there until end of mask:
> > > > > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > > > > >
> > > > > > > >
> > > > > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > > > > all ones.
> > > > >
> > > > > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > > > > which should be fine.
> > > >
> > > > How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).
> > >
> > > But we have a substration after that.
> >
> >
> > I don't get it. This is your code:
> >                s = MIN(iotlb.addr_mask + 1, used_size);
> >
> > if addr_mask is all ones s becomes 0. Then while loop will loop forever.
> 
> Ok, I think there's some misunderstanding here. I meant using:
> 
> s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> 
> Won't result in a zero for s.

If used_phys is page aligned then I think it will.

> So there's no need to move the plus one
> outside the MIN.
> 
> Thanks

I mean why don't you try?

[mst@tuck ~]$ cat > a.c
#include <stdio.h>
#define  MIN(a,b) (a<b ? a : b)

int main(int argc, char **argv)
{
   unsigned mask = 0xffffffff;
   unsigned s = MIN(mask + 1, 100);
   printf("s=0x%x\n",s);
}
[mst@tuck ~]$ gcc a.c
[mst@tuck ~]$ ./a.out 
s=0x0
[mst@tuck ~]$ 






> >
> > > In order to get a zero result,
> > > used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
> > > or anything I missed here?
> > >
> > > The 1 is calculated via assuming used_phys are all ones.
> > >
> > > Thanks
> >
> > Not used_phys, I'm talking about addr_mask being all ones.
> >
> >
> > >
> > > >
> > > > > The used_size has been validated before to be
> > > > > non-zero.
> > > > >
> > > > > Thanks
> > > > >
> > > > > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > > > > after it's capped to a reasonable value. So we end up with:
> > > > > > > >
> > > > > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > > > > >       s = MIN(s, used_size - 1) + 1;
> > > > > > > >
> > > > > > > >
> > > > > >
> > > > > > Right.
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >> +
> > > > > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > > > > >> +                                      range_get_last(phys, used_size));
> > > > > > > > why are you syncing used_size here? Shouldn't it be s?
> > > > > >
> > > > > > Let me fix this.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >> +                used_size -= s;
> > > > > > > >> +                used_phys += s;
> > > > > > > >> +            }
> > > > > > > >> +        } else {
> > > > > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > > > > >> +                                  range_get_last(used_phys, used_size));
> > > > > > > >> +        }
> > > > > > > >>      }
> > > > > > > >>      return 0;
> > > > > > > >>  }
> > > > > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > > > > >>      dev->log_size = size;
> > > > > > > >>  }
> > > > > > > >>
> > > > > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > >> -{
> > > > > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > > > > >> -
> > > > > > > >> -    /*
> > > > > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > >> -     */
> > > > > > > >> -    if (vdev) {
> > > > > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > >> -    } else {
> > > > > > > >> -        return false;
> > > > > > > >> -    }
> > > > > > > >> -}
> > > > > > > >> -
> > > > > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > > > > >>                                hwaddr *plen, bool is_write)
> > > > > > > >>  {
> > > > > > > >> --
> > > > > > > >> 2.25.1
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > Eric
> > > > > > >
> > > >
> >
Jason Wang Nov. 24, 2022, 7:31 a.m. UTC | #10
On Thu, Nov 24, 2022 at 3:06 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Nov 24, 2022 at 12:12:15PM +0800, Jason Wang wrote:
> > On Wed, Nov 23, 2022 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Nov 23, 2022 at 03:08:25PM +0800, Jason Wang wrote:
> > > > On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > > > > > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > > > > > >
> > > > > > > > Hi,
> > > > > > > >
> > > > > > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > > > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > > > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > > > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > > > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > > > > > >> vhost using 1G memory.
> > > > > > > > > Noted how exactly? What does "using 1G memory" mean?
> > > > > > > >
> > > > > > > > We hit the following assertion:
> > > > > > > >
> > > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > > >
> > > > > > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > > > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > > > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > > > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > > > > > >
> > > > > > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > > > > > >
> > > > > > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > > > > > >
> > > > > > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > > > > > that is beyond 1G.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > > > > > >> Cc: qemu-stable@nongnu.org
> > > > > > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > > > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > > > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > > > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > >> ---
> > > > > > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > > > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > > > > > >>
> > > > > > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > > > > > >> --- a/hw/virtio/vhost.c
> > > > > > > > >> +++ b/hw/virtio/vhost.c
> > > > > > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > > > > > >>      }
> > > > > > > > >>  }
> > > > > > > > >>
> > > > > > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > >> +{
> > > > > > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > > > > > >> +
> > > > > > > > >> +    /*
> > > > > > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > > > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > > >> +     */
> > > > > > > > >> +    if (vdev) {
> > > > > > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > > >> +    } else {
> > > > > > > > >> +        return false;
> > > > > > > > >> +    }
> > > > > > > > >> +}
> > > > > > > > >> +
> > > > > > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > > >>                                     MemoryRegionSection *section,
> > > > > > > > >>                                     hwaddr first,
> > > > > > > > >>                                     hwaddr last)
> > > > > > > > >>  {
> > > > > > > > >> +    IOMMUTLBEntry iotlb;
> > > > > > > > > why don't we move this inside the scope where it's used?
> > > > > > >
> > > > > > > That's fine.
> > > > > > >
> > > > > > > > >
> > > > > > > > >>      int i;
> > > > > > > > >>      hwaddr start_addr;
> > > > > > > > >>      hwaddr end_addr;
> > > > > > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > > >>      }
> > > > > > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > > > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > > > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > > > > > >> +        hwaddr phys, s;
> > > > > > > > > these two, too.
> > > > > > >
> > > > > > > Right.
> > > > > > >
> > > > > > > > >
> > > > > > > > >>
> > > > > > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > > > > > >>              continue;
> > > > > > > > >>          }
> > > > > > > > >>
> > > > > > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > > > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > > > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > > > > > >> +            while (used_size) {
> > > > > > > > >> +                rcu_read_lock();
> > > > > > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > > > > > >> +                                                      used_phys,
> > > > > > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > > > > > >> +                rcu_read_unlock();
> > > > > > > > >> +
> > > > > > > > >> +                if (iotlb.target_as == NULL) {
> > > > > > > > >> +                    return -EINVAL;
> > > > > > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > > > > > handle this. callers ignore the return value so maybe
> > > > > > > > > log guest error? iommu seems misconfigured ...
> > > > > > >
> > > > > > > Ok.
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >> +                }
> > > > > > > > >> +
> > > > > > > > >> +                phys = iotlb.translated_addr;
> > > > > > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > > > > > iotlb.addr_mask + 1 is size from there.
> > > > > > > > >
> > > > > > > > > So I think phys that you want is actually
> > > > > > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > accordingly, the size would be from there until end of mask:
> > > > > > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > > > > > all ones.
> > > > > >
> > > > > > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > > > > > which should be fine.
> > > > >
> > > > > How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).
> > > >
> > > > But we have a substration after that.
> > >
> > >
> > > I don't get it. This is your code:
> > >                s = MIN(iotlb.addr_mask + 1, used_size);
> > >
> > > if addr_mask is all ones s becomes 0. Then while loop will loop forever.
> >
> > Ok, I think there's some misunderstanding here. I meant using:
> >
> > s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> >
> > Won't result in a zero for s.
>
> If used_phys is page aligned then I think it will.

So iotlb.addr_mask + 1 is zero, used_phys & iotlb.addr_mask is used_phys:

# cat t.c
#include <stdio.h>

int main(void)
{
unsigned int mask = 0xFFFFFFFF;
unsigned long long used_phys = 0x1000;
unsigned long long s = mask + 1 - (used_phys & mask);

fprintf(stderr, "result is %llx\n", s);

return 0;
}
# gcc t.c
# ./a.out
result is fffffffffffff000

>
> > So there's no need to move the plus one
> > outside the MIN.
> >
> > Thanks
>
> I mean why don't you try?

There should be some misunderstanding. I fully understand the
MIN(mask+1, used_size) case. My comments were for

MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);

And used_phys can't be zero here.

Thanks

>
> [mst@tuck ~]$ cat > a.c
> #include <stdio.h>
> #define  MIN(a,b) (a<b ? a : b)
>
> int main(int argc, char **argv)
> {
>    unsigned mask = 0xffffffff;
>    unsigned s = MIN(mask + 1, 100);
>    printf("s=0x%x\n",s);
> }
> [mst@tuck ~]$ gcc a.c
> [mst@tuck ~]$ ./a.out
> s=0x0
> [mst@tuck ~]$
>
>
>
>
>
>
> > >
> > > > In order to get a zero result,
> > > > used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
> > > > or anything I missed here?
> > > >
> > > > The 1 is calculated via assuming used_phys are all ones.
> > > >
> > > > Thanks
> > >
> > > Not used_phys, I'm talking about addr_mask being all ones.
> > >
> > >
> > > >
> > > > >
> > > > > > The used_size has been validated before to be
> > > > > > non-zero.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > > > > > after it's capped to a reasonable value. So we end up with:
> > > > > > > > >
> > > > > > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > > > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > > > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > > > > > >       s = MIN(s, used_size - 1) + 1;
> > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > > > > Right.
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >> +
> > > > > > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > > > > > >> +                                      range_get_last(phys, used_size));
> > > > > > > > > why are you syncing used_size here? Shouldn't it be s?
> > > > > > >
> > > > > > > Let me fix this.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >> +                used_size -= s;
> > > > > > > > >> +                used_phys += s;
> > > > > > > > >> +            }
> > > > > > > > >> +        } else {
> > > > > > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > > > > > >> +                                  range_get_last(used_phys, used_size));
> > > > > > > > >> +        }
> > > > > > > > >>      }
> > > > > > > > >>      return 0;
> > > > > > > > >>  }
> > > > > > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > > > > > >>      dev->log_size = size;
> > > > > > > > >>  }
> > > > > > > > >>
> > > > > > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > >> -{
> > > > > > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > > > > > >> -
> > > > > > > > >> -    /*
> > > > > > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > > > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > > >> -     */
> > > > > > > > >> -    if (vdev) {
> > > > > > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > > >> -    } else {
> > > > > > > > >> -        return false;
> > > > > > > > >> -    }
> > > > > > > > >> -}
> > > > > > > > >> -
> > > > > > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > > > > > >>                                hwaddr *plen, bool is_write)
> > > > > > > > >>  {
> > > > > > > > >> --
> > > > > > > > >> 2.25.1
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > Eric
> > > > > > > >
> > > > >
> > >
>
Michael S. Tsirkin Nov. 24, 2022, 7:41 a.m. UTC | #11
On Thu, Nov 24, 2022 at 03:31:59PM +0800, Jason Wang wrote:
> On Thu, Nov 24, 2022 at 3:06 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Thu, Nov 24, 2022 at 12:12:15PM +0800, Jason Wang wrote:
> > > On Wed, Nov 23, 2022 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Nov 23, 2022 at 03:08:25PM +0800, Jason Wang wrote:
> > > > > On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > > > > > > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > Hi,
> > > > > > > > >
> > > > > > > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > > > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > > > > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > > > > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > > > > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > > > > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > > > > > > >> vhost using 1G memory.
> > > > > > > > > > Noted how exactly? What does "using 1G memory" mean?
> > > > > > > > >
> > > > > > > > > We hit the following assertion:
> > > > > > > > >
> > > > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > > > >
> > > > > > > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > > > > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > > > > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > > > > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > > > > > > >
> > > > > > > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > > > > > > >
> > > > > > > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > > > > > > >
> > > > > > > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > > > > > > that is beyond 1G.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > > > > > > >> Cc: qemu-stable@nongnu.org
> > > > > > > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > > > > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > > > > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > > > > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > >> ---
> > > > > > > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > > > > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > > > > > > >>
> > > > > > > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > > > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > > > > > > >> --- a/hw/virtio/vhost.c
> > > > > > > > > >> +++ b/hw/virtio/vhost.c
> > > > > > > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > > > > > > >>      }
> > > > > > > > > >>  }
> > > > > > > > > >>
> > > > > > > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > > >> +{
> > > > > > > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > > > > > > >> +
> > > > > > > > > >> +    /*
> > > > > > > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > > > > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > > > >> +     */
> > > > > > > > > >> +    if (vdev) {
> > > > > > > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > > > >> +    } else {
> > > > > > > > > >> +        return false;
> > > > > > > > > >> +    }
> > > > > > > > > >> +}
> > > > > > > > > >> +
> > > > > > > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > > > >>                                     MemoryRegionSection *section,
> > > > > > > > > >>                                     hwaddr first,
> > > > > > > > > >>                                     hwaddr last)
> > > > > > > > > >>  {
> > > > > > > > > >> +    IOMMUTLBEntry iotlb;
> > > > > > > > > > why don't we move this inside the scope where it's used?
> > > > > > > >
> > > > > > > > That's fine.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >>      int i;
> > > > > > > > > >>      hwaddr start_addr;
> > > > > > > > > >>      hwaddr end_addr;
> > > > > > > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > > > >>      }
> > > > > > > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > > > > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > > > > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > > > > > > >> +        hwaddr phys, s;
> > > > > > > > > > these two, too.
> > > > > > > >
> > > > > > > > Right.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >>
> > > > > > > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > > > > > > >>              continue;
> > > > > > > > > >>          }
> > > > > > > > > >>
> > > > > > > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > > > > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > > > > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > > > > > > >> +            while (used_size) {
> > > > > > > > > >> +                rcu_read_lock();
> > > > > > > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > > > > > > >> +                                                      used_phys,
> > > > > > > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > > > > > > >> +                rcu_read_unlock();
> > > > > > > > > >> +
> > > > > > > > > >> +                if (iotlb.target_as == NULL) {
> > > > > > > > > >> +                    return -EINVAL;
> > > > > > > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > > > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > > > > > > handle this. callers ignore the return value so maybe
> > > > > > > > > > log guest error? iommu seems misconfigured ...
> > > > > > > >
> > > > > > > > Ok.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> +                }
> > > > > > > > > >> +
> > > > > > > > > >> +                phys = iotlb.translated_addr;
> > > > > > > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > > > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > > > > > > iotlb.addr_mask + 1 is size from there.
> > > > > > > > > >
> > > > > > > > > > So I think phys that you want is actually
> > > > > > > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > accordingly, the size would be from there until end of mask:
> > > > > > > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > > > > > > all ones.
> > > > > > >
> > > > > > > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > > > > > > which should be fine.
> > > > > >
> > > > > > How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).
> > > > >
> > > > > But we have a substration after that.
> > > >
> > > >
> > > > I don't get it. This is your code:
> > > >                s = MIN(iotlb.addr_mask + 1, used_size);
> > > >
> > > > if addr_mask is all ones s becomes 0. Then while loop will loop forever.
> > >
> > > Ok, I think there's some misunderstanding here. I meant using:
> > >
> > > s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > >
> > > Won't result in a zero for s.
> >
> > If used_phys is page aligned then I think it will.
> 
> So iotlb.addr_mask + 1 is zero, used_phys & iotlb.addr_mask is used_phys:
> 
> # cat t.c
> #include <stdio.h>
> 
> int main(void)
> {
> unsigned int mask = 0xFFFFFFFF;
> unsigned long long used_phys = 0x1000;
> unsigned long long s = mask + 1 - (used_phys & mask);
> 
> fprintf(stderr, "result is %llx\n", s);
> 
> return 0;
> }
> # gcc t.c
> # ./a.out
> result is fffffffffffff000

Oh I see. So it will only trigger if used_phys is 0.
Unlikely, but I'd prefer we don't rely on this, seems fragile.


> >
> > > So there's no need to move the plus one
> > > outside the MIN.
> > >
> > > Thanks
> >
> > I mean why don't you try?
> 
> There should be some misunderstanding. I fully understand the
> MIN(mask+1, used_size) case. My comments were for
> 
> MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> 
> And used_phys can't be zero here.
> 
> Thanks

Right. IMHO + 1 outside is more robust.

> >
> > [mst@tuck ~]$ cat > a.c
> > #include <stdio.h>
> > #define  MIN(a,b) (a<b ? a : b)
> >
> > int main(int argc, char **argv)
> > {
> >    unsigned mask = 0xffffffff;
> >    unsigned s = MIN(mask + 1, 100);
> >    printf("s=0x%x\n",s);
> > }
> > [mst@tuck ~]$ gcc a.c
> > [mst@tuck ~]$ ./a.out
> > s=0x0
> > [mst@tuck ~]$
> >
> >
> >
> >
> >
> >
> > > >
> > > > > In order to get a zero result,
> > > > > used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
> > > > > or anything I missed here?
> > > > >
> > > > > The 1 is calculated via assuming used_phys are all ones.
> > > > >
> > > > > Thanks
> > > >
> > > > Not used_phys, I'm talking about addr_mask being all ones.
> > > >
> > > >
> > > > >
> > > > > >
> > > > > > > The used_size has been validated before to be
> > > > > > > non-zero.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > > > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > > > > > > after it's capped to a reasonable value. So we end up with:
> > > > > > > > > >
> > > > > > > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > > > > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > > > > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > > > > > > >       s = MIN(s, used_size - 1) + 1;
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > >
> > > > > > > > Right.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> +
> > > > > > > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > > > > > > >> +                                      range_get_last(phys, used_size));
> > > > > > > > > > why are you syncing used_size here? Shouldn't it be s?
> > > > > > > >
> > > > > > > > Let me fix this.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> +                used_size -= s;
> > > > > > > > > >> +                used_phys += s;
> > > > > > > > > >> +            }
> > > > > > > > > >> +        } else {
> > > > > > > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > > > > > > >> +                                  range_get_last(used_phys, used_size));
> > > > > > > > > >> +        }
> > > > > > > > > >>      }
> > > > > > > > > >>      return 0;
> > > > > > > > > >>  }
> > > > > > > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > > > > > > >>      dev->log_size = size;
> > > > > > > > > >>  }
> > > > > > > > > >>
> > > > > > > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > > >> -{
> > > > > > > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > > > > > > >> -
> > > > > > > > > >> -    /*
> > > > > > > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > > > > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > > > >> -     */
> > > > > > > > > >> -    if (vdev) {
> > > > > > > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > > > >> -    } else {
> > > > > > > > > >> -        return false;
> > > > > > > > > >> -    }
> > > > > > > > > >> -}
> > > > > > > > > >> -
> > > > > > > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > > > > > > >>                                hwaddr *plen, bool is_write)
> > > > > > > > > >>  {
> > > > > > > > > >> --
> > > > > > > > > >> 2.25.1
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > Eric
> > > > > > > > >
> > > > > >
> > > >
> >
Jason Wang Nov. 24, 2022, 7:58 a.m. UTC | #12
On Thu, Nov 24, 2022 at 3:41 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Nov 24, 2022 at 03:31:59PM +0800, Jason Wang wrote:
> > On Thu, Nov 24, 2022 at 3:06 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Thu, Nov 24, 2022 at 12:12:15PM +0800, Jason Wang wrote:
> > > > On Wed, Nov 23, 2022 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Nov 23, 2022 at 03:08:25PM +0800, Jason Wang wrote:
> > > > > > On Wed, Nov 23, 2022 at 2:15 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Wed, Nov 23, 2022 at 01:47:04PM +0800, Jason Wang wrote:
> > > > > > > > On Wed, Nov 23, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Nov 23, 2022 at 12:28 AM Eric Auger <eric.auger@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > Hi,
> > > > > > > > > >
> > > > > > > > > > On 11/22/22 10:43, Michael S. Tsirkin wrote:
> > > > > > > > > > > On Tue, Nov 22, 2022 at 11:01:11AM +0800, Jason Wang wrote:
> > > > > > > > > > >> When vIOMMU is enabled, the vq->used_phys is actually the IOVA not
> > > > > > > > > > >> GPA. So we need to translate it to GPA before the syncing otherwise we
> > > > > > > > > > >> may hit the following crash since IOVA could be out of the scope of
> > > > > > > > > > >> the GPA log size. This could be noted when using virtio-IOMMU with
> > > > > > > > > > >> vhost using 1G memory.
> > > > > > > > > > > Noted how exactly? What does "using 1G memory" mean?
> > > > > > > > > >
> > > > > > > > > > We hit the following assertion:
> > > > > > > > > >
> > > > > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > > > > >
> > > > > > > > > > On the last time vhost_get_log_size() is called it takes into account 2 regions when computing the log_size:
> > > > > > > > > > qemu-system-x86_64: vhost_get_log_size region 0 last=0x9ffff updated log_size=0x3
> > > > > > > > > > qemu-system-x86_64: vhost_get_log_size region 1 last=0x3fffffff updated log_size=0x1000
> > > > > > > > > > so in vhost_migration_log() vhost_get_log_size(dev) returns 0x1000
> > > > > > > > > >
> > > > > > > > > > In the test case, memory_region_sync_dirty_bitmap() gets called for mem-machine_mem, vga.vram (several times) and eventually on pc.bios. This latter is reponsible for the assertion:
> > > > > > > > > >
> > > > > > > > > > qemu-system-x86_64: vhost_log_sync calls sync_dirty_map on pc.bios for the full range
> > > > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 0
> > > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x9ffff < start=0xfffc0000
> > > > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on region 1
> > > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region end=0x3fffffff < start=0xfffc0000
> > > > > > > > > > qemu-system-x86_64: vhost_sync_dirty_bitmap calls vhost_dev_sync_region on vq 0 <-----
> > > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios mfirst=0xfffc0000 mlast=0xffffffff rfirst=0xfffff240 rlast=0xfffffa45
> > > > > > > > > > qemu-system-x86_64: vhost_dev_sync_region pc.bios end=0xfffffa45 VHOST_LOG_CHUNK=0x40000 end/VHOST_LOG_CHUNK=0x3fff dev->log_size=0x1000
> > > > > > > > > > qemu-system-x86_64: ../hw/virtio/vhost.c:85: vhost_dev_sync_region: Assertion `end / VHOST_LOG_CHUNK < dev->log_size' failed.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > "using 1G memory": We hit the issue with a guest started with 1GB initial RAM.
> > > > > > > > >
> > > > > > > > > Yes, so in the case the guest iova allocator may try to use an IOVA
> > > > > > > > > that is beyond 1G.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> Fixes: c471ad0e9bd46 ("vhost_net: device IOTLB support")
> > > > > > > > > > >> Cc: qemu-stable@nongnu.org
> > > > > > > > > > >> Reported-by: Yalan Zhang <yalzhang@redhat.com>
> > > > > > > > > > >> Tested-by: Eric Auger <eric.auger@redhat.com>
> > > > > > > > > > >> Tested-by: Lei Yang <leiyang@redhat.com>
> > > > > > > > > > >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > >> ---
> > > > > > > > > > >>  hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++++++++---------------
> > > > > > > > > > >>  1 file changed, 45 insertions(+), 20 deletions(-)
> > > > > > > > > > >>
> > > > > > > > > > >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > > > > > > >> index d1c4c20b8c..26b319f34e 100644
> > > > > > > > > > >> --- a/hw/virtio/vhost.c
> > > > > > > > > > >> +++ b/hw/virtio/vhost.c
> > > > > > > > > > >> @@ -106,11 +106,30 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
> > > > > > > > > > >>      }
> > > > > > > > > > >>  }
> > > > > > > > > > >>
> > > > > > > > > > >> +static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > > > >> +{
> > > > > > > > > > >> +    VirtIODevice *vdev = dev->vdev;
> > > > > > > > > > >> +
> > > > > > > > > > >> +    /*
> > > > > > > > > > >> +     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > > > > >> +     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > > > > >> +     * does not have IOMMU, there's no need to enable this feature
> > > > > > > > > > >> +     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > > > > >> +     */
> > > > > > > > > > >> +    if (vdev) {
> > > > > > > > > > >> +        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > > > > >> +            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > > > > >> +    } else {
> > > > > > > > > > >> +        return false;
> > > > > > > > > > >> +    }
> > > > > > > > > > >> +}
> > > > > > > > > > >> +
> > > > > > > > > > >>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > > > > >>                                     MemoryRegionSection *section,
> > > > > > > > > > >>                                     hwaddr first,
> > > > > > > > > > >>                                     hwaddr last)
> > > > > > > > > > >>  {
> > > > > > > > > > >> +    IOMMUTLBEntry iotlb;
> > > > > > > > > > > why don't we move this inside the scope where it's used?
> > > > > > > > >
> > > > > > > > > That's fine.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >>      int i;
> > > > > > > > > > >>      hwaddr start_addr;
> > > > > > > > > > >>      hwaddr end_addr;
> > > > > > > > > > >> @@ -132,13 +151,37 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> > > > > > > > > > >>      }
> > > > > > > > > > >>      for (i = 0; i < dev->nvqs; ++i) {
> > > > > > > > > > >>          struct vhost_virtqueue *vq = dev->vqs + i;
> > > > > > > > > > >> +        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
> > > > > > > > > > >> +        hwaddr phys, s;
> > > > > > > > > > > these two, too.
> > > > > > > > >
> > > > > > > > > Right.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >>
> > > > > > > > > > >>          if (!vq->used_phys && !vq->used_size) {
> > > > > > > > > > >>              continue;
> > > > > > > > > > >>          }
> > > > > > > > > > >>
> > > > > > > > > > >> -        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
> > > > > > > > > > >> -                              range_get_last(vq->used_phys, vq->used_size));
> > > > > > > > > > >> +        if (vhost_dev_has_iommu(dev)) {
> > > > > > > > > > >> +            while (used_size) {
> > > > > > > > > > >> +                rcu_read_lock();
> > > > > > > > > > >> +                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
> > > > > > > > > > >> +                                                      used_phys,
> > > > > > > > > > >> +                                                      true, MEMTXATTRS_UNSPECIFIED);
> > > > > > > > > > >> +                rcu_read_unlock();
> > > > > > > > > > >> +
> > > > > > > > > > >> +                if (iotlb.target_as == NULL) {
> > > > > > > > > > >> +                    return -EINVAL;
> > > > > > > > > > > I am not sure how this can trigger. I don't like == NULL:
> > > > > > > > > > > !iotlb.target_as is more succint. But a bigger question is how to
> > > > > > > > > > > handle this. callers ignore the return value so maybe
> > > > > > > > > > > log guest error? iommu seems misconfigured ...
> > > > > > > > >
> > > > > > > > > Ok.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> +                }
> > > > > > > > > > >> +
> > > > > > > > > > >> +                phys = iotlb.translated_addr;
> > > > > > > > > > >> +                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > > > > > > > > Note, that iotlb.translated_addr here is an aligned address and
> > > > > > > > > > > iotlb.addr_mask + 1 is size from there.
> > > > > > > > > > >
> > > > > > > > > > > So I think phys that you want is actually
> > > > > > > > > > >       phys = iotlb.translated_addr + (used_phys & iotlb.addr_mask);
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > accordingly, the size would be from there until end of mask:
> > > > > > > > > > >       s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Also, it bothers me that the math here will give you 0 if addr_mask is
> > > > > > > > > > > all ones.
> > > > > > > >
> > > > > > > > So even if addr_mask is all ones, we end up with s = MIN(1, used_size)
> > > > > > > > which should be fine.
> > > > > > >
> > > > > > > How do you figure? addr_mask is all ones, addr_mask + 1 is 0, we get MIN(0, used_size).
> > > > > >
> > > > > > But we have a substration after that.
> > > > >
> > > > >
> > > > > I don't get it. This is your code:
> > > > >                s = MIN(iotlb.addr_mask + 1, used_size);
> > > > >
> > > > > if addr_mask is all ones s becomes 0. Then while loop will loop forever.
> > > >
> > > > Ok, I think there's some misunderstanding here. I meant using:
> > > >
> > > > s = MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> > > >
> > > > Won't result in a zero for s.
> > >
> > > If used_phys is page aligned then I think it will.
> >
> > So iotlb.addr_mask + 1 is zero, used_phys & iotlb.addr_mask is used_phys:
> >
> > # cat t.c
> > #include <stdio.h>
> >
> > int main(void)
> > {
> > unsigned int mask = 0xFFFFFFFF;
> > unsigned long long used_phys = 0x1000;
> > unsigned long long s = mask + 1 - (used_phys & mask);
> >
> > fprintf(stderr, "result is %llx\n", s);
> >
> > return 0;
> > }
> > # gcc t.c
> > # ./a.out
> > result is fffffffffffff000
>
> Oh I see. So it will only trigger if used_phys is 0.
> Unlikely, but I'd prefer we don't rely on this, seems fragile.

Ok.

>
>
> > >
> > > > So there's no need to move the plus one
> > > > outside the MIN.
> > > >
> > > > Thanks
> > >
> > > I mean why don't you try?
> >
> > There should be some misunderstanding. I fully understand the
> > MIN(mask+1, used_size) case. My comments were for
> >
> > MIN(iotlb.addr_mask + 1 - (used_phys & iotlb.addr_mask), used_size);
> >
> > And used_phys can't be zero here.
> >
> > Thanks
>
> Right. IMHO + 1 outside is more robust.

Ok. Will do.

Thanks

>
> > >
> > > [mst@tuck ~]$ cat > a.c
> > > #include <stdio.h>
> > > #define  MIN(a,b) (a<b ? a : b)
> > >
> > > int main(int argc, char **argv)
> > > {
> > >    unsigned mask = 0xffffffff;
> > >    unsigned s = MIN(mask + 1, 100);
> > >    printf("s=0x%x\n",s);
> > > }
> > > [mst@tuck ~]$ gcc a.c
> > > [mst@tuck ~]$ ./a.out
> > > s=0x0
> > > [mst@tuck ~]$
> > >
> > >
> > >
> > >
> > >
> > >
> > > > >
> > > > > > In order to get a zero result,
> > > > > > used_phys must be zero then used_phys & iotlb.addr_mask can be zero,
> > > > > > or anything I missed here?
> > > > > >
> > > > > > The 1 is calculated via assuming used_phys are all ones.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Not used_phys, I'm talking about addr_mask being all ones.
> > > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > The used_size has been validated before to be
> > > > > > > > non-zero.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > > Then MIN will give 0 too and we loop forever.  I think this
> > > > > > > > > > > can not trigger, but I'd rather we play it safe and add outside of MIN
> > > > > > > > > > > after it's capped to a reasonable value. So we end up with:
> > > > > > > > > > >
> > > > > > > > > > >       /* Distance from start of used ring until last byte of IOMMU page */
> > > > > > > > > > >       s = iotlb.addr_mask - (used_phys & iotlb.addr_mask);
> > > > > > > > > > >       /* size of used ring, or of the part of it until end of IOMMU page */
> > > > > > > > > > >       s = MIN(s, used_size - 1) + 1;
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Right.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> +
> > > > > > > > > > >> +                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
> > > > > > > > > > >> +                                      range_get_last(phys, used_size));
> > > > > > > > > > > why are you syncing used_size here? Shouldn't it be s?
> > > > > > > > >
> > > > > > > > > Let me fix this.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> +                used_size -= s;
> > > > > > > > > > >> +                used_phys += s;
> > > > > > > > > > >> +            }
> > > > > > > > > > >> +        } else {
> > > > > > > > > > >> +            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
> > > > > > > > > > >> +                                  range_get_last(used_phys, used_size));
> > > > > > > > > > >> +        }
> > > > > > > > > > >>      }
> > > > > > > > > > >>      return 0;
> > > > > > > > > > >>  }
> > > > > > > > > > >> @@ -306,24 +349,6 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> > > > > > > > > > >>      dev->log_size = size;
> > > > > > > > > > >>  }
> > > > > > > > > > >>
> > > > > > > > > > >> -static bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > > > >> -{
> > > > > > > > > > >> -    VirtIODevice *vdev = dev->vdev;
> > > > > > > > > > >> -
> > > > > > > > > > >> -    /*
> > > > > > > > > > >> -     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
> > > > > > > > > > >> -     * incremental memory mapping API via IOTLB API. For platform that
> > > > > > > > > > >> -     * does not have IOMMU, there's no need to enable this feature
> > > > > > > > > > >> -     * which may cause unnecessary IOTLB miss/update transactions.
> > > > > > > > > > >> -     */
> > > > > > > > > > >> -    if (vdev) {
> > > > > > > > > > >> -        return virtio_bus_device_iommu_enabled(vdev) &&
> > > > > > > > > > >> -            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
> > > > > > > > > > >> -    } else {
> > > > > > > > > > >> -        return false;
> > > > > > > > > > >> -    }
> > > > > > > > > > >> -}
> > > > > > > > > > >> -
> > > > > > > > > > >>  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
> > > > > > > > > > >>                                hwaddr *plen, bool is_write)
> > > > > > > > > > >>  {
> > > > > > > > > > >> --
> > > > > > > > > > >> 2.25.1
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > Eric
> > > > > > > > > >
> > > > > > >
> > > > >
> > >
>
diff mbox series

Patch

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index d1c4c20b8c..26b319f34e 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -106,11 +106,30 @@  static void vhost_dev_sync_region(struct vhost_dev *dev,
     }
 }
 
+static bool vhost_dev_has_iommu(struct vhost_dev *dev)
+{
+    VirtIODevice *vdev = dev->vdev;
+
+    /*
+     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
+     * incremental memory mapping API via IOTLB API. For platform that
+     * does not have IOMMU, there's no need to enable this feature
+     * which may cause unnecessary IOTLB miss/update transactions.
+     */
+    if (vdev) {
+        return virtio_bus_device_iommu_enabled(vdev) &&
+            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+    } else {
+        return false;
+    }
+}
+
 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                    MemoryRegionSection *section,
                                    hwaddr first,
                                    hwaddr last)
 {
+    IOMMUTLBEntry iotlb;
     int i;
     hwaddr start_addr;
     hwaddr end_addr;
@@ -132,13 +151,37 @@  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
     }
     for (i = 0; i < dev->nvqs; ++i) {
         struct vhost_virtqueue *vq = dev->vqs + i;
+        hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
+        hwaddr phys, s;
 
         if (!vq->used_phys && !vq->used_size) {
             continue;
         }
 
-        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
-                              range_get_last(vq->used_phys, vq->used_size));
+        if (vhost_dev_has_iommu(dev)) {
+            while (used_size) {
+                rcu_read_lock();
+                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
+                                                      used_phys,
+                                                      true, MEMTXATTRS_UNSPECIFIED);
+                rcu_read_unlock();
+
+                if (iotlb.target_as == NULL) {
+                    return -EINVAL;
+                }
+
+                phys = iotlb.translated_addr;
+                s = MIN(iotlb.addr_mask + 1, used_size);
+
+                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
+                                      range_get_last(phys, used_size));
+                used_size -= s;
+                used_phys += s;
+            }
+        } else {
+            vhost_dev_sync_region(dev, section, start_addr, end_addr, used_phys,
+                                  range_get_last(used_phys, used_size));
+        }
     }
     return 0;
 }
@@ -306,24 +349,6 @@  static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
     dev->log_size = size;
 }
 
-static bool vhost_dev_has_iommu(struct vhost_dev *dev)
-{
-    VirtIODevice *vdev = dev->vdev;
-
-    /*
-     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
-     * incremental memory mapping API via IOTLB API. For platform that
-     * does not have IOMMU, there's no need to enable this feature
-     * which may cause unnecessary IOTLB miss/update transactions.
-     */
-    if (vdev) {
-        return virtio_bus_device_iommu_enabled(vdev) &&
-            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
-    } else {
-        return false;
-    }
-}
-
 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                               hwaddr *plen, bool is_write)
 {