diff mbox series

[1/1] hw/arm/virt: Support for virtio-mem-pci

Message ID 20211130003328.201270-2-gshan@redhat.com (mailing list archive)
State New, archived
Headers show
Series hw/arm/virt: Support for virtio-mem-pci | expand

Commit Message

Gavin Shan Nov. 30, 2021, 12:33 a.m. UTC
This supports virtio-mem-pci device on "virt" platform, by simply
following the implementation on x86.

   * The patch was written by David Hildenbrand <david@redhat.com>
     modified by Jonathan Cameron <Jonathan.Cameron@huawei.com>

   * This implements the hotplug handlers to support virtio-mem-pci
     device hot-add, while the hot-remove isn't supported as we have
     on x86.

   * The block size is 1GB on ARM64 instead of 128MB on x86.

   * It has been passing the tests with various combinations like 64KB
     and 4KB page sizes on host and guest, different memory device
     backends like normal, transparent huge page and HugeTLB, plus
     migration.

Signed-off-by: Gavin Shan <gshan@redhat.com>
---
 hw/arm/Kconfig         |  1 +
 hw/arm/virt.c          | 68 +++++++++++++++++++++++++++++++++++++++++-
 hw/virtio/virtio-mem.c |  2 ++
 3 files changed, 70 insertions(+), 1 deletion(-)

Comments

David Hildenbrand Nov. 30, 2021, 9:37 a.m. UTC | #1
On 30.11.21 01:33, Gavin Shan wrote:
> This supports virtio-mem-pci device on "virt" platform, by simply
> following the implementation on x86.

Thanks for picking this up!

> 
>    * The patch was written by David Hildenbrand <david@redhat.com>
>      modified by Jonathan Cameron <Jonathan.Cameron@huawei.com>

Maybe replace this section by

Co-developed-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> 
>    * This implements the hotplug handlers to support virtio-mem-pci
>      device hot-add, while the hot-remove isn't supported as we have
>      on x86.
> 
>    * The block size is 1GB on ARM64 instead of 128MB on x86.

See below, isn't it actually 512 MiB nowadays?

> 
>    * It has been passing the tests with various combinations like 64KB
>      and 4KB page sizes on host and guest, different memory device
>      backends like normal, transparent huge page and HugeTLB, plus
>      migration.

Perfect. A note that hugetlbfs isn't fully supported/safe to use until
we have preallocation support in QEMU (WIP).

> 
> Signed-off-by: Gavin Shan <gshan@redhat.com>
> ---
>  hw/arm/Kconfig         |  1 +
>  hw/arm/virt.c          | 68 +++++++++++++++++++++++++++++++++++++++++-
>  hw/virtio/virtio-mem.c |  2 ++
>  3 files changed, 70 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
> index 2d37d29f02..15aff8efb8 100644
> --- a/hw/arm/Kconfig
> +++ b/hw/arm/Kconfig
> @@ -27,6 +27,7 @@ config ARM_VIRT
>      select DIMM
>      select ACPI_HW_REDUCED
>      select ACPI_APEI
> +    select VIRTIO_MEM_SUPPORTED
>  
>  config CHEETAH
>      bool
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 369552ad45..f4599a5ef0 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -71,9 +71,11 @@
>  #include "hw/arm/smmuv3.h"
>  #include "hw/acpi/acpi.h"
>  #include "target/arm/internals.h"
> +#include "hw/mem/memory-device.h"
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/mem/nvdimm.h"
>  #include "hw/acpi/generic_event_device.h"
> +#include "hw/virtio/virtio-mem-pci.h"
>  #include "hw/virtio/virtio-iommu.h"
>  #include "hw/char/pl011.h"
>  #include "qemu/guest-random.h"
> @@ -2480,6 +2482,63 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
>                           dev, &error_abort);
>  }
>  
> +static void virt_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev,
> +                                        DeviceState *dev, Error **errp)
> +{
> +    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
> +    Error *local_err = NULL;
> +
> +    if (!hotplug_dev2 && dev->hotplugged) {
> +        /*
> +         * Without a bus hotplug handler, we cannot control the plug/unplug
> +         * order. We should never reach this point when hotplugging on x86,
> +         * however, better add a safety net.
> +         */
> +        error_setg(errp, "hotplug of virtio based memory devices not supported"
> +                   " on this bus.");
> +        return;
> +    }
> +    /*
> +     * First, see if we can plug this memory device at all. If that
> +     * succeeds, branch of to the actual hotplug handler.
> +     */
> +    memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
> +                           &local_err);
> +    if (!local_err && hotplug_dev2) {
> +        hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err);
> +    }
> +    error_propagate(errp, local_err);
> +}
> +
> +static void virt_virtio_md_pci_plug(HotplugHandler *hotplug_dev,
> +                                    DeviceState *dev, Error **errp)
> +{
> +    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
> +    Error *local_err = NULL;
> +
> +    /*
> +     * Plug the memory device first and then branch off to the actual
> +     * hotplug handler. If that one fails, we can easily undo the memory
> +     * device bits.
> +     */
> +    memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
> +    if (hotplug_dev2) {
> +        hotplug_handler_plug(hotplug_dev2, dev, &local_err);
> +        if (local_err) {
> +            memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
> +        }
> +    }
> +    error_propagate(errp, local_err);
> +}
> +
> +static void virt_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev,
> +                                              DeviceState *dev, Error **errp)
> +{
> +    /* We don't support hot unplug of virtio based memory devices */
> +    error_setg(errp, "virtio based memory devices cannot be unplugged.");
> +}
> +
> +
>  static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>                                              DeviceState *dev, Error **errp)
>  {
> @@ -2513,6 +2572,8 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>          qdev_prop_set_uint32(dev, "len-reserved-regions", 1);
>          qdev_prop_set_string(dev, "reserved-regions[0]", resv_prop_str);
>          g_free(resv_prop_str);
> +    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
> +        virt_virtio_md_pci_pre_plug(hotplug_dev, dev, errp);
>      }
>  }
>  
> @@ -2538,6 +2599,8 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
>          vms->iommu = VIRT_IOMMU_VIRTIO;
>          vms->virtio_iommu_bdf = pci_get_bdf(pdev);
>          create_virtio_iommu_dt_bindings(vms);
> +    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
> +        virt_virtio_md_pci_plug(hotplug_dev, dev, errp);
>      }
>  }
>  
> @@ -2588,6 +2651,8 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
>  {
>      if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
>          virt_dimm_unplug_request(hotplug_dev, dev, errp);
> +    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
> +        virt_virtio_md_pci_unplug_request(hotplug_dev, dev, errp);
>      } else {
>          error_setg(errp, "device unplug request for unsupported device"
>                     " type: %s", object_get_typename(OBJECT(dev)));
> @@ -2611,7 +2676,8 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
>  
>      if (device_is_dynamic_sysbus(mc, dev) ||
> -       (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM))) {
> +        object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
> +        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
>          return HOTPLUG_HANDLER(machine);
>      }
>      if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index d5a578142b..3033692a83 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -126,6 +126,8 @@ static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
>   */
>  #if defined(TARGET_X86_64) || defined(TARGET_I386)
>  #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
> +#elif defined(TARGET_ARM)
> +#define VIRTIO_MEM_USABLE_EXTENT (2 * (1024 * MiB))


Can we make this 512 MiB ?

arch/arm64/include/asm/sparsemem.h

/*
 * Section size must be at least 512MB for 64K base
 * page size config. Otherwise it will be less than
 * (MAX_ORDER - 1) and the build process will fail.
 */
#ifdef CONFIG_ARM64_64K_PAGES
#define SECTION_SIZE_BITS 29

#else

/*
 * Section size must be at least 128MB for 4K base
 * page size config. Otherwise PMD based huge page
 * entries could not be created for vmemmap mappings.
 * 16K follows 4K for simplicity.
 */
#define SECTION_SIZE_BITS 27
#endif /* CONFIG_ARM64_64K_PAGES */


Apart from that, LGTM -- thanks!
Gavin Shan Dec. 1, 2021, 5:09 a.m. UTC | #2
On 11/30/21 8:37 PM, David Hildenbrand wrote:
> On 30.11.21 01:33, Gavin Shan wrote:
>> This supports virtio-mem-pci device on "virt" platform, by simply
>> following the implementation on x86.
> 
> Thanks for picking this up!
> 

Thanks, David.

>>
>>     * The patch was written by David Hildenbrand <david@redhat.com>
>>       modified by Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> Maybe replace this section by
> 
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 

Yes, it will be included into v2.

>>
>>     * This implements the hotplug handlers to support virtio-mem-pci
>>       device hot-add, while the hot-remove isn't supported as we have
>>       on x86.
>>
>>     * The block size is 1GB on ARM64 instead of 128MB on x86.
> 
> See below, isn't it actually 512 MiB nowadays?
> 

I think so.

>>
>>     * It has been passing the tests with various combinations like 64KB
>>       and 4KB page sizes on host and guest, different memory device
>>       backends like normal, transparent huge page and HugeTLB, plus
>>       migration.
> 
> Perfect. A note that hugetlbfs isn't fully supported/safe to use until
> we have preallocation support in QEMU (WIP).
> 

Yes, there is some warnings raised to enlarge 'request-size' on
host with 64KB page size. Note that the memory backends I used
in the testings always have "prealloc=on" property though.

>>
>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>> ---
>>   hw/arm/Kconfig         |  1 +
>>   hw/arm/virt.c          | 68 +++++++++++++++++++++++++++++++++++++++++-
>>   hw/virtio/virtio-mem.c |  2 ++
>>   3 files changed, 70 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
>> index 2d37d29f02..15aff8efb8 100644
>> --- a/hw/arm/Kconfig
>> +++ b/hw/arm/Kconfig
>> @@ -27,6 +27,7 @@ config ARM_VIRT
>>       select DIMM
>>       select ACPI_HW_REDUCED
>>       select ACPI_APEI
>> +    select VIRTIO_MEM_SUPPORTED
>>   
>>   config CHEETAH
>>       bool
>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>> index 369552ad45..f4599a5ef0 100644
>> --- a/hw/arm/virt.c
>> +++ b/hw/arm/virt.c
>> @@ -71,9 +71,11 @@
>>   #include "hw/arm/smmuv3.h"
>>   #include "hw/acpi/acpi.h"
>>   #include "target/arm/internals.h"
>> +#include "hw/mem/memory-device.h"
>>   #include "hw/mem/pc-dimm.h"
>>   #include "hw/mem/nvdimm.h"
>>   #include "hw/acpi/generic_event_device.h"
>> +#include "hw/virtio/virtio-mem-pci.h"
>>   #include "hw/virtio/virtio-iommu.h"
>>   #include "hw/char/pl011.h"
>>   #include "qemu/guest-random.h"
>> @@ -2480,6 +2482,63 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
>>                            dev, &error_abort);
>>   }
>>   
>> +static void virt_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev,
>> +                                        DeviceState *dev, Error **errp)
>> +{
>> +    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
>> +    Error *local_err = NULL;
>> +
>> +    if (!hotplug_dev2 && dev->hotplugged) {
>> +        /*
>> +         * Without a bus hotplug handler, we cannot control the plug/unplug
>> +         * order. We should never reach this point when hotplugging on x86,
>> +         * however, better add a safety net.
>> +         */
>> +        error_setg(errp, "hotplug of virtio based memory devices not supported"
>> +                   " on this bus.");
>> +        return;
>> +    }
>> +    /*
>> +     * First, see if we can plug this memory device at all. If that
>> +     * succeeds, branch of to the actual hotplug handler.
>> +     */
>> +    memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
>> +                           &local_err);
>> +    if (!local_err && hotplug_dev2) {
>> +        hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err);
>> +    }
>> +    error_propagate(errp, local_err);
>> +}
>> +
>> +static void virt_virtio_md_pci_plug(HotplugHandler *hotplug_dev,
>> +                                    DeviceState *dev, Error **errp)
>> +{
>> +    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
>> +    Error *local_err = NULL;
>> +
>> +    /*
>> +     * Plug the memory device first and then branch off to the actual
>> +     * hotplug handler. If that one fails, we can easily undo the memory
>> +     * device bits.
>> +     */
>> +    memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
>> +    if (hotplug_dev2) {
>> +        hotplug_handler_plug(hotplug_dev2, dev, &local_err);
>> +        if (local_err) {
>> +            memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
>> +        }
>> +    }
>> +    error_propagate(errp, local_err);
>> +}
>> +
>> +static void virt_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev,
>> +                                              DeviceState *dev, Error **errp)
>> +{
>> +    /* We don't support hot unplug of virtio based memory devices */
>> +    error_setg(errp, "virtio based memory devices cannot be unplugged.");
>> +}
>> +
>> +
>>   static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>>                                               DeviceState *dev, Error **errp)
>>   {
>> @@ -2513,6 +2572,8 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>>           qdev_prop_set_uint32(dev, "len-reserved-regions", 1);
>>           qdev_prop_set_string(dev, "reserved-regions[0]", resv_prop_str);
>>           g_free(resv_prop_str);
>> +    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
>> +        virt_virtio_md_pci_pre_plug(hotplug_dev, dev, errp);
>>       }
>>   }
>>   
>> @@ -2538,6 +2599,8 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
>>           vms->iommu = VIRT_IOMMU_VIRTIO;
>>           vms->virtio_iommu_bdf = pci_get_bdf(pdev);
>>           create_virtio_iommu_dt_bindings(vms);
>> +    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
>> +        virt_virtio_md_pci_plug(hotplug_dev, dev, errp);
>>       }
>>   }
>>   
>> @@ -2588,6 +2651,8 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
>>   {
>>       if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
>>           virt_dimm_unplug_request(hotplug_dev, dev, errp);
>> +    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
>> +        virt_virtio_md_pci_unplug_request(hotplug_dev, dev, errp);
>>       } else {
>>           error_setg(errp, "device unplug request for unsupported device"
>>                      " type: %s", object_get_typename(OBJECT(dev)));
>> @@ -2611,7 +2676,8 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
>>       MachineClass *mc = MACHINE_GET_CLASS(machine);
>>   
>>       if (device_is_dynamic_sysbus(mc, dev) ||
>> -       (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM))) {
>> +        object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
>> +        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
>>           return HOTPLUG_HANDLER(machine);
>>       }
>>       if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
>> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
>> index d5a578142b..3033692a83 100644
>> --- a/hw/virtio/virtio-mem.c
>> +++ b/hw/virtio/virtio-mem.c
>> @@ -126,6 +126,8 @@ static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
>>    */
>>   #if defined(TARGET_X86_64) || defined(TARGET_I386)
>>   #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
>> +#elif defined(TARGET_ARM)
>> +#define VIRTIO_MEM_USABLE_EXTENT (2 * (1024 * MiB))
> 
> 
> Can we make this 512 MiB ?
> 
> arch/arm64/include/asm/sparsemem.h
> 
> /*
>   * Section size must be at least 512MB for 64K base
>   * page size config. Otherwise it will be less than
>   * (MAX_ORDER - 1) and the build process will fail.
>   */
> #ifdef CONFIG_ARM64_64K_PAGES
> #define SECTION_SIZE_BITS 29
> 
> #else
> 
> /*
>   * Section size must be at least 128MB for 4K base
>   * page size config. Otherwise PMD based huge page
>   * entries could not be created for vmemmap mappings.
>   * 16K follows 4K for simplicity.
>   */
> #define SECTION_SIZE_BITS 27
> #endif /* CONFIG_ARM64_64K_PAGES */
> 
> 
> Apart from that, LGTM -- thanks!
> 

Indeed. The scetion size has been changed by the following
linux commit. v2 will include the changes.

f0b13ee2324184 arm64/sparsemem: reduce SECTION_SIZE_BITS
(Wed Jan 20 21:29:13 2021 Sudarshan Rajagopalan <sudaraja@codeaurora.org>)

Thanks,
Gavin
David Hildenbrand Dec. 1, 2021, 9:03 a.m. UTC | #3
>>>
>>>     * It has been passing the tests with various combinations like 64KB
>>>       and 4KB page sizes on host and guest, different memory device
>>>       backends like normal, transparent huge page and HugeTLB, plus
>>>       migration.
>>
>> Perfect. A note that hugetlbfs isn't fully supported/safe to use until
>> we have preallocation support in QEMU (WIP).
>>
> 
> Yes, there is some warnings raised to enlarge 'request-size' on
> host with 64KB page size. Note that the memory backends I used
> in the testings always have "prealloc=on" property though.

1. prealloc=on

"prealloc=on" on the memory backend won't get the job done, because the first
thing virtio-mem does is discard all memory in the memory backend again when
it initializes. So it's an expensive NOP :) See

https://lkml.kernel.org/r/20211130104136.40927-9-david@redhat.com

for the virtio-mem "prealloc=on" option that preallocates memory when
exposing that memory to the VM.

To use huge pages in a safe way with virtio-mem, we need "reserve=off" on the
memory backend and "prealloc=on" on the virtio-mem device. (I'm in the process
of documenting that on virtio-mem.gitlab.io/ to make it clearer)


2. Warning on arm64 with 64k

I assume the warning you're seeing is regarding the block-size:

"Read unsupported THP size: ..." followed by
"Could not detect THP size, falling back to ..."

The right thing to do for now should be to remove that sanity check:

diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index d5a578142b..33c32afeb1 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -78,11 +78,8 @@ static uint32_t virtio_mem_thp_size(void)
     if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
         !qemu_strtou64(content, &endptr, 0, &tmp) &&
         (!endptr || *endptr == '\n')) {
-        /*
-         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
-         * pages) or weird, fallback to something smaller.
-         */
-        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
+        /* Sanity-check the value and fallback to something reasonable.  */
+        if (!tmp || !is_power_of_2(tmp)) {
             warn_report("Read unsupported THP size: %" PRIx64, tmp);
         } else {
             thp_size = tmp;

This will not affect setups we care about ( x86-64 KVM ).

It will imply that with a arm64 64k host, we can only hot(un)plug in
512 MiB granularity when not using hugetlb witht he default block-size.
However, that is already the case with arm64 64k guests as well.
The suggestion will be to use arm64 4k with virtio-mem in the host and
the guest for increased flexibility -- fortunately most distros
already have performed the switch to 4k on arm64, so we don't really
care IMHO.

To support block_size < THP size when not using hugetlb,
we'll have to disable THP (via MADV_NOHUGEPAGE) permanently for the memory
region, also making sure that e.g., postcopy won't re-enable it by adding
a proper flag (RAM_NOHUGEPAGE) to the RAMBlock. Because the issue is that
once the guest touches some memory, we might populate a THP that would cover
plugged and unplugged memory, which is bad.

Instead of warning in virtio_mem_device_realize() when
	vmem->block_size < virtio_mem_default_block_size(rb)
we'd have to disable THP.


Further, we should fixup the default THP size on arm64 in case we're
running on an older kernel where we cannot sense the THP size:


diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index d5a578142b..371cee380a 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -38,13 +38,21 @@
  */
 #define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
 
+static uint32_t virtio_mem_default_thp_size(void)
+{
+#if defined(__aarch64__)
+    if (qemu_real_host_page_size == 64 * KiB) {
+        return 512 * MiB;
+    }
+#endif
 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
     defined(__powerpc64__)
-#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
+    return 2 * MiB;
 #else
-        /* fallback to 1 MiB (e.g., the THP size on s390x) */
-#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
+    /* fallback to 1 MiB (e.g., the THP size on s390x) */
+    return VIRTIO_MEM_MIN_BLOCK_SIZE;
 #endif
+}
 
 /*
  * We want to have a reasonable default block size such that
@@ -90,7 +98,7 @@ static uint32_t virtio_mem_thp_size(void)
     }
 
     if (!thp_size) {
-        thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
+        thp_size = virtio_mem_default_thp_size();
         warn_report("Could not detect THP size, falling back to %" PRIx64
                     "  MiB.", thp_size / MiB);
     }



In the context of proper arm64 support, adding the above two changes
should be good enough. If you agree, can you include them in your v2
series as a separate patch?

Supporting block_size < thp_size when not using hugetlb is a different
work IMHO, if someone ever cares about that.



Thanks!
Gavin Shan Dec. 3, 2021, 3:42 a.m. UTC | #4
On 12/1/21 8:03 PM, David Hildenbrand wrote:
>>>>
>>>>      * It has been passing the tests with various combinations like 64KB
>>>>        and 4KB page sizes on host and guest, different memory device
>>>>        backends like normal, transparent huge page and HugeTLB, plus
>>>>        migration.
>>>
>>> Perfect. A note that hugetlbfs isn't fully supported/safe to use until
>>> we have preallocation support in QEMU (WIP).
>>>
>>
>> Yes, there is some warnings raised to enlarge 'request-size' on
>> host with 64KB page size. Note that the memory backends I used
>> in the testings always have "prealloc=on" property though.
> 
> 1. prealloc=on
> 
> "prealloc=on" on the memory backend won't get the job done, because the first
> thing virtio-mem does is discard all memory in the memory backend again when
> it initializes. So it's an expensive NOP :) See
> 
> https://lkml.kernel.org/r/20211130104136.40927-9-david@redhat.com
> 
> for the virtio-mem "prealloc=on" option that preallocates memory when
> exposing that memory to the VM.
> 
> To use huge pages in a safe way with virtio-mem, we need "reserve=off" on the
> memory backend and "prealloc=on" on the virtio-mem device. (I'm in the process
> of documenting that on virtio-mem.gitlab.io/ to make it clearer)
> 

David, I will reply in a different thread for discussion purpose only. I
need some time for the investigation :)

> 
> 2. Warning on arm64 with 64k
> 
> I assume the warning you're seeing is regarding the block-size:
> 
> "Read unsupported THP size: ..." followed by
> "Could not detect THP size, falling back to ..."
> 
> The right thing to do for now should be to remove that sanity check:
> 
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index d5a578142b..33c32afeb1 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -78,11 +78,8 @@ static uint32_t virtio_mem_thp_size(void)
>       if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
>           !qemu_strtou64(content, &endptr, 0, &tmp) &&
>           (!endptr || *endptr == '\n')) {
> -        /*
> -         * Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
> -         * pages) or weird, fallback to something smaller.
> -         */
> -        if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
> +        /* Sanity-check the value and fallback to something reasonable.  */
> +        if (!tmp || !is_power_of_2(tmp)) {
>               warn_report("Read unsupported THP size: %" PRIx64, tmp);
>           } else {
>               thp_size = tmp;
> 
> This will not affect setups we care about ( x86-64 KVM ).
> 
> It will imply that with a arm64 64k host, we can only hot(un)plug in
> 512 MiB granularity when not using hugetlb witht he default block-size.
> However, that is already the case with arm64 64k guests as well.
> The suggestion will be to use arm64 4k with virtio-mem in the host and
> the guest for increased flexibility -- fortunately most distros
> already have performed the switch to 4k on arm64, so we don't really
> care IMHO.
> 
> To support block_size < THP size when not using hugetlb,
> we'll have to disable THP (via MADV_NOHUGEPAGE) permanently for the memory
> region, also making sure that e.g., postcopy won't re-enable it by adding
> a proper flag (RAM_NOHUGEPAGE) to the RAMBlock. Because the issue is that
> once the guest touches some memory, we might populate a THP that would cover
> plugged and unplugged memory, which is bad.
> 
> Instead of warning in virtio_mem_device_realize() when
> 	vmem->block_size < virtio_mem_default_block_size(rb)
> we'd have to disable THP.
> 
> 
> Further, we should fixup the default THP size on arm64 in case we're
> running on an older kernel where we cannot sense the THP size:
> 
> 
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index d5a578142b..371cee380a 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -38,13 +38,21 @@
>    */
>   #define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
>   
> +static uint32_t virtio_mem_default_thp_size(void)
> +{
> +#if defined(__aarch64__)
> +    if (qemu_real_host_page_size == 64 * KiB) {
> +        return 512 * MiB;
> +    }
> +#endif
>   #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
>       defined(__powerpc64__)
> -#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
> +    return 2 * MiB;
>   #else
> -        /* fallback to 1 MiB (e.g., the THP size on s390x) */
> -#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
> +    /* fallback to 1 MiB (e.g., the THP size on s390x) */
> +    return VIRTIO_MEM_MIN_BLOCK_SIZE;
>   #endif
> +}
>   
>   /*
>    * We want to have a reasonable default block size such that
> @@ -90,7 +98,7 @@ static uint32_t virtio_mem_thp_size(void)
>       }
>   
>       if (!thp_size) {
> -        thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
> +        thp_size = virtio_mem_default_thp_size();
>           warn_report("Could not detect THP size, falling back to %" PRIx64
>                       "  MiB.", thp_size / MiB);
>       }
> 
> 
> 
> In the context of proper arm64 support, adding the above two changes
> should be good enough. If you agree, can you include them in your v2
> series as a separate patch?
> 
> Supporting block_size < thp_size when not using hugetlb is a different
> work IMHO, if someone ever cares about that.
> 
> 

Yeah, It's exactly the warnings I observed and I agree on the changes
except the 16KB-base-page-size case is missed. I've included all the
changes into separate patch in v2, which was just posted.

Thanks,
Gavin
diff mbox series

Patch

diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index 2d37d29f02..15aff8efb8 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -27,6 +27,7 @@  config ARM_VIRT
     select DIMM
     select ACPI_HW_REDUCED
     select ACPI_APEI
+    select VIRTIO_MEM_SUPPORTED
 
 config CHEETAH
     bool
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 369552ad45..f4599a5ef0 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -71,9 +71,11 @@ 
 #include "hw/arm/smmuv3.h"
 #include "hw/acpi/acpi.h"
 #include "target/arm/internals.h"
+#include "hw/mem/memory-device.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/mem/nvdimm.h"
 #include "hw/acpi/generic_event_device.h"
+#include "hw/virtio/virtio-mem-pci.h"
 #include "hw/virtio/virtio-iommu.h"
 #include "hw/char/pl011.h"
 #include "qemu/guest-random.h"
@@ -2480,6 +2482,63 @@  static void virt_memory_plug(HotplugHandler *hotplug_dev,
                          dev, &error_abort);
 }
 
+static void virt_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev,
+                                        DeviceState *dev, Error **errp)
+{
+    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
+    Error *local_err = NULL;
+
+    if (!hotplug_dev2 && dev->hotplugged) {
+        /*
+         * Without a bus hotplug handler, we cannot control the plug/unplug
+         * order. We should never reach this point when hotplugging on x86,
+         * however, better add a safety net.
+         */
+        error_setg(errp, "hotplug of virtio based memory devices not supported"
+                   " on this bus.");
+        return;
+    }
+    /*
+     * First, see if we can plug this memory device at all. If that
+     * succeeds, branch of to the actual hotplug handler.
+     */
+    memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
+                           &local_err);
+    if (!local_err && hotplug_dev2) {
+        hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err);
+    }
+    error_propagate(errp, local_err);
+}
+
+static void virt_virtio_md_pci_plug(HotplugHandler *hotplug_dev,
+                                    DeviceState *dev, Error **errp)
+{
+    HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
+    Error *local_err = NULL;
+
+    /*
+     * Plug the memory device first and then branch off to the actual
+     * hotplug handler. If that one fails, we can easily undo the memory
+     * device bits.
+     */
+    memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+    if (hotplug_dev2) {
+        hotplug_handler_plug(hotplug_dev2, dev, &local_err);
+        if (local_err) {
+            memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
+        }
+    }
+    error_propagate(errp, local_err);
+}
+
+static void virt_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev,
+                                              DeviceState *dev, Error **errp)
+{
+    /* We don't support hot unplug of virtio based memory devices */
+    error_setg(errp, "virtio based memory devices cannot be unplugged.");
+}
+
+
 static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                                             DeviceState *dev, Error **errp)
 {
@@ -2513,6 +2572,8 @@  static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
         qdev_prop_set_uint32(dev, "len-reserved-regions", 1);
         qdev_prop_set_string(dev, "reserved-regions[0]", resv_prop_str);
         g_free(resv_prop_str);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+        virt_virtio_md_pci_pre_plug(hotplug_dev, dev, errp);
     }
 }
 
@@ -2538,6 +2599,8 @@  static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev,
         vms->iommu = VIRT_IOMMU_VIRTIO;
         vms->virtio_iommu_bdf = pci_get_bdf(pdev);
         create_virtio_iommu_dt_bindings(vms);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+        virt_virtio_md_pci_plug(hotplug_dev, dev, errp);
     }
 }
 
@@ -2588,6 +2651,8 @@  static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
 {
     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
         virt_dimm_unplug_request(hotplug_dev, dev, errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
+        virt_virtio_md_pci_unplug_request(hotplug_dev, dev, errp);
     } else {
         error_setg(errp, "device unplug request for unsupported device"
                    " type: %s", object_get_typename(OBJECT(dev)));
@@ -2611,7 +2676,8 @@  static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
     MachineClass *mc = MACHINE_GET_CLASS(machine);
 
     if (device_is_dynamic_sysbus(mc, dev) ||
-       (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM))) {
+        object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
+        object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
         return HOTPLUG_HANDLER(machine);
     }
     if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index d5a578142b..3033692a83 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -126,6 +126,8 @@  static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
  */
 #if defined(TARGET_X86_64) || defined(TARGET_I386)
 #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
+#elif defined(TARGET_ARM)
+#define VIRTIO_MEM_USABLE_EXTENT (2 * (1024 * MiB))
 #else
 #error VIRTIO_MEM_USABLE_EXTENT not defined
 #endif