Message ID | 20180516152026.2920-7-shameerali.kolothum.thodi@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Shameer, On 05/16/2018 05:20 PM, Shameer Kolothum wrote: > In case valid iova regions are non-contiguous, split the > RAM mem into a 1GB non-pluggable dimm and remaining as a > single pc-dimm mem. Please can you explain where does this split come from? Currently we have 254 GB non pluggable RAM. I read the discussion started with Drew on RFC v1 where he explained we cannot change the RAM base without crashing the FW. However we should at least document this 1GB split. > > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > --- > hw/arm/virt.c | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 256 insertions(+), 5 deletions(-) > > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index be3ad14..562e389 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -58,6 +58,12 @@ > #include "hw/smbios/smbios.h" > #include "qapi/visitor.h" > #include "standard-headers/linux/input.h" > +#include "hw/vfio/vfio-common.h" > +#include "qemu/config-file.h" > +#include "monitor/qdev.h" > +#include "qom/object_interfaces.h" > +#include "qapi/qmp/qdict.h" > +#include "qemu/option.h" The comment at the beginning of virt.c would need to be reworked with your changes. > > #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ > static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ > @@ -110,7 +116,10 @@ static ARMPlatformBusSystemParams platform_bus_params; > * terabyte of physical address space.) > */ > #define RAMLIMIT_GB 255 > -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024) > +#define SZ_1G (1024ULL * 1024 * 1024) > +#define RAMLIMIT_BYTES (RAMLIMIT_GB * SZ_1G) > + > +#define ALIGN_1G (1ULL << 30) > > /* Addresses and sizes of our components. > * 0..128MB is space for a flash device so we can run bootrom code such as UEFI. > @@ -1171,6 +1180,236 @@ void virt_machine_done(Notifier *notifier, void *data) > virt_build_smbios(vms); > } > > +static void free_iova_copy(struct vfio_iova_head *iova_copy) > +{ > + VFIOIovaRange *iova, *tmp; > + > + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { > + QLIST_REMOVE(iova, next); > + g_free(iova); > + } > +} > + > +static void get_iova_copy(struct vfio_iova_head *iova_copy) > +{ > + VFIOIovaRange *iova, *new, *prev_iova = NULL; > + > + QLIST_FOREACH(iova, &vfio_iova_regions, next) { > + new = g_malloc0(sizeof(*iova)); > + new->start = iova->start; > + new->end = iova->end; > + > + if (prev_iova) { > + QLIST_INSERT_AFTER(prev_iova, new, next); > + } else { > + QLIST_INSERT_HEAD(iova_copy, new, next); > + } > + prev_iova = new; > + } > +} > + > +static hwaddr find_memory_chunk(VirtMachineState *vms, > + struct vfio_iova_head *iova_copy, > + hwaddr req_size, bool pcdimm) > +{ > + VFIOIovaRange *iova, *tmp; > + hwaddr new_start, new_size, sz_align; > + hwaddr virt_start = vms->memmap[VIRT_MEM].base; > + hwaddr addr_align = ALIGN_1G; /* Set to max ARM64 hugepage size */ > + > + /* Size alignment */ > + sz_align = (pcdimm) ? MAX(TARGET_PAGE_SIZE, QEMU_VMALLOC_ALIGN) : > + TARGET_PAGE_SIZE; > + > + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { > + if (virt_start >= iova->end) { > + continue; > + } > + > + /* Align addr */ > + new_start = ROUND_UP(MAX(virt_start, iova->start), addr_align); > + if (new_start >= iova->end) { > + continue; > + } > + > + if (req_size > iova->end - new_start + 1) { > + continue; > + } don't you want to check directly with new_size? > + > + /* > + * Check the region can hold any size alignment requirement. > + */ > + new_size = QEMU_ALIGN_UP(req_size, sz_align); > + > + if ((new_start + new_size - 1 > iova->end) || > + (new_start + new_size >= virt_start + RAMLIMIT_BYTES)) { > + continue; > + } > + > + /* > + * Modify the iova list entry for non pc-dimm case so that it > + * is not used again for pc-dimm allocation. > + */ > + if (!pcdimm) { > + if (new_size - req_size) { I don't get this check, Don't you want to check the node's range is fully consumed and in the positive remove the node? (new_size != iova->end - iova-> start + 1) > + iova->start = new_start + new_size; > + } else { > + QLIST_REMOVE(iova, next); > + } > + } > + return new_start; > + } > + > + return -1; > +} > + > +static void update_memory_regions(VirtMachineState *vms) > +{ > + hwaddr addr; > + VFIOIovaRange *iova; > + MachineState *machine = MACHINE(vms); > + hwaddr virt_start = vms->memmap[VIRT_MEM].base; > + hwaddr req_size, ram_size = machine->ram_size; > + struct vfio_iova_head iova_copy = QLIST_HEAD_INITIALIZER(iova_copy); > + > + /* No valid iova regions, use default */ > + if (QLIST_EMPTY(&vfio_iova_regions)) { > + vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; > + vms->bootinfo.ram_size = ram_size; > + return; > + } > + > + /* > + * If valid iovas has only one entry, check the req size fits in > + * and can have the loader start < 4GB. This will make sure platforms > + * with no holes in mem will have the same mem model as before. > + */ > + req_size = ram_size; > + iova = QLIST_NEXT(QLIST_FIRST(&vfio_iova_regions), next); > + if (!iova) { > + iova = QLIST_FIRST(&vfio_iova_regions); > + addr = ROUND_UP(MAX(virt_start, iova->start), ALIGN_1G); > + if ((addr < 4 * SZ_1G) && (ram_size <= iova->end - addr + 1) && > + (addr + ram_size < virt_start + RAMLIMIT_BYTES)) { > + vms->bootinfo.loader_start = addr; > + vms->bootinfo.ram_size = ram_size; > + return; > + } > + } > + > + /* Get a copy of valid iovas and work on it */ > + get_iova_copy(&iova_copy); > + > + /* Split the mem as first 1GB non-pluggable and rest as pc-dimm */ > + req_size = MIN(ram_size, SZ_1G); > + addr = find_memory_chunk(vms, &iova_copy, req_size, false); According to what Drew reported, the base address of the cold-plug RAM must stay at 1GB otherwise the FW will be lost. So I think you must check the addr = 1GB > + if (addr == -1 || addr >= 4 * SZ_1G) { > + goto out; > + } > + > + /*Update non-pluggable mem details */ > + machine->ram_size = req_size; > + vms->bootinfo.loader_start = addr; > + vms->bootinfo.ram_size = machine->ram_size; > + > + req_size = ram_size - req_size; > + if (!req_size) { > + goto done; > + } > + > + /* Remaining memory is modeled as a pc-dimm. */ > + addr = find_memory_chunk(vms, &iova_copy, req_size, true); > + if (addr == -1) { > + goto out; > + } > + > + /*Update pc-dimm mem details */ > + vms->bootinfo.dimm_mem = g_new(struct dimm_mem_info, 1); > + vms->bootinfo.dimm_mem->base = addr; > + vms->bootinfo.dimm_mem->size = req_size; > + machine->maxram_size = machine->ram_size + req_size; > + machine->ram_slots += 1; > + > +done: > + free_iova_copy(&iova_copy); > + return; > + > +out: > + free_iova_copy(&iova_copy); > + error_report("mach-virt: Not enough contiguous memory to model ram"); Output the requested size would help debug (and maybe the available IOVA ranges) Thanks Eric > + exit(1); > +} > + > +static void create_pcdimms(VirtMachineState *vms, > + MemoryRegion *sysmem, > + MemoryRegion *ram) > +{ > + hwaddr addr, size; > + Error *local_err = NULL; > + QDict *qdict; > + QemuOpts *opts; > + char *tmp; > + > + if (!vms->bootinfo.dimm_mem) { > + return; > + } > + > + addr = vms->bootinfo.dimm_mem->base; > + size = vms->bootinfo.dimm_mem->size; > + > + /*Create hotplug address space */ > + vms->hotplug_memory.base = ROUND_UP(addr, ALIGN_1G); > + size = ROUND_UP(size, MAX(TARGET_PAGE_SIZE, QEMU_VMALLOC_ALIGN)); > + > + memory_region_init(&vms->hotplug_memory.mr, OBJECT(vms), > + "hotplug-memory", size); > + memory_region_add_subregion(sysmem, vms->hotplug_memory.base, > + &vms->hotplug_memory.mr); > + /* Create backend mem object */ > + qdict = qdict_new(); > + qdict_put_str(qdict, "qom-type", "memory-backend-ram"); > + qdict_put_str(qdict, "id", "mem1"); > + tmp = g_strdup_printf("%"PRIu64 "M", size / (1024 * 1024)); > + qdict_put_str(qdict, "size", tmp); > + g_free(tmp); > + > + opts = qemu_opts_from_qdict(qemu_find_opts("object"), qdict, &local_err); > + if (local_err) { > + goto err; > + } > + > + user_creatable_add_opts(opts, &local_err); > + qemu_opts_del(opts); > + QDECREF(qdict); > + if (local_err) { > + goto err; > + } > + > + /* Create pc-dimm dev*/ > + qdict = qdict_new(); > + qdict_put_str(qdict, "driver", "pc-dimm"); > + qdict_put_str(qdict, "id", "dimm1"); > + qdict_put_str(qdict, "memdev", "mem1"); > + > + opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, &local_err); > + if (local_err) { > + goto err; > + } > + > + qdev_device_add(opts, &local_err); > + qemu_opts_del(opts); > + QDECREF(qdict); > + if (local_err) { > + goto err; > + } > + > + return; > + > +err: > + error_report_err(local_err); > + exit(1); > +} > + > static void virt_ram_memory_region_init(Notifier *notifier, void *data) > { > MemoryRegion *sysmem = get_system_memory(); > @@ -1179,9 +1418,14 @@ static void virt_ram_memory_region_init(Notifier *notifier, void *data) > ram_memory_region_init); > MachineState *machine = MACHINE(vms); > > + update_memory_regions(vms); > memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", > machine->ram_size); > - memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram); > + memory_region_add_subregion(sysmem, vms->bootinfo.loader_start, ram); > + > + if (vms->bootinfo.dimm_mem) { > + create_pcdimms(vms, sysmem, ram); > + } > } > > static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx) > @@ -1404,13 +1648,11 @@ static void machvirt_init(MachineState *machine) > vms->machine_done.notify = virt_machine_done; > qemu_add_machine_init_done_notifier(&vms->machine_done); > > - vms->bootinfo.ram_size = machine->ram_size; > vms->bootinfo.kernel_filename = machine->kernel_filename; > vms->bootinfo.kernel_cmdline = machine->kernel_cmdline; > vms->bootinfo.initrd_filename = machine->initrd_filename; > vms->bootinfo.nb_cpus = smp_cpus; > vms->bootinfo.board_id = -1; > - vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; > vms->bootinfo.get_dtb = machvirt_dtb; > vms->bootinfo.firmware_loaded = firmware_loaded; > > @@ -1559,7 +1801,7 @@ static void virt_dimm_plug(HotplugHandler *hotplug_dev, > PCDIMMDevice *dimm = PC_DIMM(dev); > PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); > MemoryRegion *mr; > - uint64_t align; > + uint64_t align, addr; > Error *local_err = NULL; > > mr = ddc->get_memory_region(dimm, &local_err); > @@ -1573,6 +1815,15 @@ static void virt_dimm_plug(HotplugHandler *hotplug_dev, > goto out; > } > > + addr = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP, > + &error_fatal); > + /* Assign the node for pc-dimm initial ram */ > + if (vms->bootinfo.dimm_mem && (addr == vms->bootinfo.dimm_mem->base) > + && (nb_numa_nodes > 0)) { > + vms->bootinfo.dimm_mem->node = object_property_get_uint(OBJECT(dev), > + PC_DIMM_NODE_PROP, &error_fatal); > + } > + > out: > error_propagate(errp, local_err); > } >
> -----Original Message----- > From: Auger Eric [mailto:eric.auger@redhat.com] > Sent: Monday, May 28, 2018 3:22 PM > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>; > qemu-devel@nongnu.org; qemu-arm@nongnu.org > Cc: drjones@redhat.com; imammedo@redhat.com; peter.maydell@linaro.org; > alex.williamson@redhat.com; Zhaoshenglong <zhaoshenglong@huawei.com>; > Jonathan Cameron <jonathan.cameron@huawei.com>; Linuxarm > <linuxarm@huawei.com> > Subject: Re: [RFC v2 6/6] hw/arm: Populate non-contiguous memory regions > > Hi Shameer, > > On 05/16/2018 05:20 PM, Shameer Kolothum wrote: > > In case valid iova regions are non-contiguous, split the > > RAM mem into a 1GB non-pluggable dimm and remaining as a > > single pc-dimm mem. > > Please can you explain where does this split come from? Currently we > have 254 GB non pluggable RAM. I read the discussion started with Drew > on RFC v1 where he explained we cannot change the RAM base without > crashing the FW. However we should at least document this 1GB split. The comments were mainly to use the pc-dimm model instead of "mem alias" method used on RFC v1 as this will help to add the mem hot-plug support in future. I am not sure about the motive behind Drew's idea of splitting the first 1GB as non-plug and remaining as a pc-dimm(cold). May it is to attempt a best effort scenario, but as I mentioned in reply to 0/6, the current solution will end up changing the base address if the 0x4000000 falls under a reserved region. Again, not sure whether we should enforce a strict check on base address start or just warn the user that it will fail on Guest with UEFI boot[1]. Hi Drew, Please let me know your thoughts on this. > > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > > --- > > hw/arm/virt.c | 261 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > > 1 file changed, 256 insertions(+), 5 deletions(-) > > > > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > > index be3ad14..562e389 100644 > > --- a/hw/arm/virt.c > > +++ b/hw/arm/virt.c > > @@ -58,6 +58,12 @@ > > #include "hw/smbios/smbios.h" > > #include "qapi/visitor.h" > > #include "standard-headers/linux/input.h" > > +#include "hw/vfio/vfio-common.h" > > +#include "qemu/config-file.h" > > +#include "monitor/qdev.h" > > +#include "qom/object_interfaces.h" > > +#include "qapi/qmp/qdict.h" > > +#include "qemu/option.h" > > The comment at the beginning of virt.c would need to be reworked with > your changes. Ok. > > #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ > > static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ > > @@ -110,7 +116,10 @@ static ARMPlatformBusSystemParams > platform_bus_params; > > * terabyte of physical address space.) > > */ > > #define RAMLIMIT_GB 255 > > -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024) > > +#define SZ_1G (1024ULL * 1024 * 1024) > > +#define RAMLIMIT_BYTES (RAMLIMIT_GB * SZ_1G) > > + > > +#define ALIGN_1G (1ULL << 30) > > > > /* Addresses and sizes of our components. > > * 0..128MB is space for a flash device so we can run bootrom code such as > UEFI. > > @@ -1171,6 +1180,236 @@ void virt_machine_done(Notifier *notifier, void > *data) > > virt_build_smbios(vms); > > } > > > > +static void free_iova_copy(struct vfio_iova_head *iova_copy) > > +{ > > + VFIOIovaRange *iova, *tmp; > > + > > + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { > > + QLIST_REMOVE(iova, next); > > + g_free(iova); > > + } > > +} > > + > > +static void get_iova_copy(struct vfio_iova_head *iova_copy) > > +{ > > + VFIOIovaRange *iova, *new, *prev_iova = NULL; > > + > > + QLIST_FOREACH(iova, &vfio_iova_regions, next) { > > + new = g_malloc0(sizeof(*iova)); > > + new->start = iova->start; > > + new->end = iova->end; > > + > > + if (prev_iova) { > > + QLIST_INSERT_AFTER(prev_iova, new, next); > > + } else { > > + QLIST_INSERT_HEAD(iova_copy, new, next); > > + } > > + prev_iova = new; > > + } > > +} > > + > > +static hwaddr find_memory_chunk(VirtMachineState *vms, > > + struct vfio_iova_head *iova_copy, > > + hwaddr req_size, bool pcdimm) > > +{ > > + VFIOIovaRange *iova, *tmp; > > + hwaddr new_start, new_size, sz_align; > > + hwaddr virt_start = vms->memmap[VIRT_MEM].base; > > + hwaddr addr_align = ALIGN_1G; /* Set to max ARM64 hugepage size */ > > + > > + /* Size alignment */ > > + sz_align = (pcdimm) ? MAX(TARGET_PAGE_SIZE, > QEMU_VMALLOC_ALIGN) : > > + TARGET_PAGE_SIZE; > > + > > + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { > > + if (virt_start >= iova->end) { > > + continue; > > + } > > + > > + /* Align addr */ > > + new_start = ROUND_UP(MAX(virt_start, iova->start), addr_align); > > + if (new_start >= iova->end) { > > + continue; > > + } > > + > > + if (req_size > iova->end - new_start + 1) { > > + continue; > > + } > don't you want to check directly with new_size? Yes. I think that should do. > > + > > + /* > > + * Check the region can hold any size alignment requirement. > > + */ > > + new_size = QEMU_ALIGN_UP(req_size, sz_align); > > + > > + if ((new_start + new_size - 1 > iova->end) || > > + (new_start + new_size >= virt_start + RAMLIMIT_BYTES)) { > > + continue; > > + } > > + > > + /* > > + * Modify the iova list entry for non pc-dimm case so that it > > + * is not used again for pc-dimm allocation. > > + */ > > + if (!pcdimm) { > > + if (new_size - req_size) { > I don't get this check, Don't you want to check the node's range is > fully consumed and in the positive remove the node? > > (new_size != iova->end - iova-> start + 1) Hmm..looks like I missed that. > > + iova->start = new_start + new_size; > > + } else { > > + QLIST_REMOVE(iova, next); > > + } > > + } > > + return new_start; > > + } > > + > > + return -1; > > +} > > + > > +static void update_memory_regions(VirtMachineState *vms) > > +{ > > + hwaddr addr; > > + VFIOIovaRange *iova; > > + MachineState *machine = MACHINE(vms); > > + hwaddr virt_start = vms->memmap[VIRT_MEM].base; > > + hwaddr req_size, ram_size = machine->ram_size; > > + struct vfio_iova_head iova_copy = QLIST_HEAD_INITIALIZER(iova_copy); > > + > > + /* No valid iova regions, use default */ > > + if (QLIST_EMPTY(&vfio_iova_regions)) { > > + vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; > > + vms->bootinfo.ram_size = ram_size; > > + return; > > + } > > + > > + /* > > + * If valid iovas has only one entry, check the req size fits in > > + * and can have the loader start < 4GB. This will make sure platforms > > + * with no holes in mem will have the same mem model as before. > > + */ > > + req_size = ram_size; > > + iova = QLIST_NEXT(QLIST_FIRST(&vfio_iova_regions), next); > > + if (!iova) { > > + iova = QLIST_FIRST(&vfio_iova_regions); > > + addr = ROUND_UP(MAX(virt_start, iova->start), ALIGN_1G); > > + if ((addr < 4 * SZ_1G) && (ram_size <= iova->end - addr + 1) && > > + (addr + ram_size < virt_start + RAMLIMIT_BYTES)) { > > + vms->bootinfo.loader_start = addr; > > + vms->bootinfo.ram_size = ram_size; > > + return; > > + } > > + } > > + > > + /* Get a copy of valid iovas and work on it */ > > + get_iova_copy(&iova_copy); > > + > > + /* Split the mem as first 1GB non-pluggable and rest as pc-dimm */ > > + req_size = MIN(ram_size, SZ_1G); > > + addr = find_memory_chunk(vms, &iova_copy, req_size, false); > According to what Drew reported, the base address of the cold-plug RAM > must stay at 1GB otherwise the FW will be lost. So I think you must > check the addr = 1GB Please see my above comment on the base address. > > + if (addr == -1 || addr >= 4 * SZ_1G) { > > + goto out; > > + } > > + > > + /*Update non-pluggable mem details */ > > + machine->ram_size = req_size; > > + vms->bootinfo.loader_start = addr; > > + vms->bootinfo.ram_size = machine->ram_size; > > + > > + req_size = ram_size - req_size; > > + if (!req_size) { > > + goto done; > > + } > > + > > + /* Remaining memory is modeled as a pc-dimm. */ > > + addr = find_memory_chunk(vms, &iova_copy, req_size, true); > > + if (addr == -1) { > > + goto out; > > + } > > + > > + /*Update pc-dimm mem details */ > > + vms->bootinfo.dimm_mem = g_new(struct dimm_mem_info, 1); > > + vms->bootinfo.dimm_mem->base = addr; > > + vms->bootinfo.dimm_mem->size = req_size; > > + machine->maxram_size = machine->ram_size + req_size; > > + machine->ram_slots += 1; > > + > > +done: > > + free_iova_copy(&iova_copy); > > + return; > > + > > +out: > > + free_iova_copy(&iova_copy); > > + error_report("mach-virt: Not enough contiguous memory to model ram"); > Output the requested size would help debug (and maybe the available IOVA > ranges) Agree. I will change that. Thanks, Shameer [1] https://github.com/tianocore/edk2/blob/master/ArmVirtPkg/ArmVirtQemu.dsc#L133 > Thanks > > Eric > > + exit(1); > > +} > > + > > +static void create_pcdimms(VirtMachineState *vms, > > + MemoryRegion *sysmem, > > + MemoryRegion *ram) > > +{ > > + hwaddr addr, size; > > + Error *local_err = NULL; > > + QDict *qdict; > > + QemuOpts *opts; > > + char *tmp; > > + > > + if (!vms->bootinfo.dimm_mem) { > > + return; > > + } > > + > > + addr = vms->bootinfo.dimm_mem->base; > > + size = vms->bootinfo.dimm_mem->size; > > + > > + /*Create hotplug address space */ > > + vms->hotplug_memory.base = ROUND_UP(addr, ALIGN_1G); > > + size = ROUND_UP(size, MAX(TARGET_PAGE_SIZE, > QEMU_VMALLOC_ALIGN)); > > + > > + memory_region_init(&vms->hotplug_memory.mr, OBJECT(vms), > > + "hotplug-memory", size); > > + memory_region_add_subregion(sysmem, vms->hotplug_memory.base, > > + &vms->hotplug_memory.mr); > > + /* Create backend mem object */ > > + qdict = qdict_new(); > > + qdict_put_str(qdict, "qom-type", "memory-backend-ram"); > > + qdict_put_str(qdict, "id", "mem1"); > > + tmp = g_strdup_printf("%"PRIu64 "M", size / (1024 * 1024)); > > + qdict_put_str(qdict, "size", tmp); > > + g_free(tmp); > > + > > + opts = qemu_opts_from_qdict(qemu_find_opts("object"), qdict, > &local_err); > > + if (local_err) { > > + goto err; > > + } > > + > > + user_creatable_add_opts(opts, &local_err); > > + qemu_opts_del(opts); > > + QDECREF(qdict); > > + if (local_err) { > > + goto err; > > + } > > + > > + /* Create pc-dimm dev*/ > > + qdict = qdict_new(); > > + qdict_put_str(qdict, "driver", "pc-dimm"); > > + qdict_put_str(qdict, "id", "dimm1"); > > + qdict_put_str(qdict, "memdev", "mem1"); > > + > > + opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, > &local_err); > > + if (local_err) { > > + goto err; > > + } > > + > > + qdev_device_add(opts, &local_err); > > + qemu_opts_del(opts); > > + QDECREF(qdict); > > + if (local_err) { > > + goto err; > > + } > > + > > + return; > > + > > +err: > > + error_report_err(local_err); > > + exit(1); > > +} > > + > > static void virt_ram_memory_region_init(Notifier *notifier, void *data) > > { > > MemoryRegion *sysmem = get_system_memory(); > > @@ -1179,9 +1418,14 @@ static void virt_ram_memory_region_init(Notifier > *notifier, void *data) > > ram_memory_region_init); > > MachineState *machine = MACHINE(vms); > > > > + update_memory_regions(vms); > > memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", > > machine->ram_size); > > - memory_region_add_subregion(sysmem, vms- > >memmap[VIRT_MEM].base, ram); > > + memory_region_add_subregion(sysmem, vms->bootinfo.loader_start, > ram); > > + > > + if (vms->bootinfo.dimm_mem) { > > + create_pcdimms(vms, sysmem, ram); > > + } > > } > > > > static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx) > > @@ -1404,13 +1648,11 @@ static void machvirt_init(MachineState > *machine) > > vms->machine_done.notify = virt_machine_done; > > qemu_add_machine_init_done_notifier(&vms->machine_done); > > > > - vms->bootinfo.ram_size = machine->ram_size; > > vms->bootinfo.kernel_filename = machine->kernel_filename; > > vms->bootinfo.kernel_cmdline = machine->kernel_cmdline; > > vms->bootinfo.initrd_filename = machine->initrd_filename; > > vms->bootinfo.nb_cpus = smp_cpus; > > vms->bootinfo.board_id = -1; > > - vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; > > vms->bootinfo.get_dtb = machvirt_dtb; > > vms->bootinfo.firmware_loaded = firmware_loaded; > > > > @@ -1559,7 +1801,7 @@ static void virt_dimm_plug(HotplugHandler > *hotplug_dev, > > PCDIMMDevice *dimm = PC_DIMM(dev); > > PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); > > MemoryRegion *mr; > > - uint64_t align; > > + uint64_t align, addr; > > Error *local_err = NULL; > > > > mr = ddc->get_memory_region(dimm, &local_err); > > @@ -1573,6 +1815,15 @@ static void virt_dimm_plug(HotplugHandler > *hotplug_dev, > > goto out; > > } > > > > + addr = object_property_get_uint(OBJECT(dimm), > PC_DIMM_ADDR_PROP, > > + &error_fatal); > > + /* Assign the node for pc-dimm initial ram */ > > + if (vms->bootinfo.dimm_mem && (addr == vms->bootinfo.dimm_mem- > >base) > > + && (nb_numa_nodes > 0)) { > > + vms->bootinfo.dimm_mem->node = > object_property_get_uint(OBJECT(dev), > > + PC_DIMM_NODE_PROP, &error_fatal); > > + } > > + > > out: > > error_propagate(errp, local_err); > > } > >
Hi Drew, > -----Original Message----- > From: Shameerali Kolothum Thodi > Sent: 30 May 2018 15:49 > To: 'Auger Eric' <eric.auger@redhat.com>; qemu-devel@nongnu.org; qemu- > arm@nongnu.org > Cc: drjones@redhat.com; imammedo@redhat.com; peter.maydell@linaro.org; > alex.williamson@redhat.com; Zhaoshenglong <zhaoshenglong@huawei.com>; > Jonathan Cameron <jonathan.cameron@huawei.com>; Linuxarm > <linuxarm@huawei.com> > Subject: RE: [RFC v2 6/6] hw/arm: Populate non-contiguous memory regions > > > > > -----Original Message----- > > From: Auger Eric [mailto:eric.auger@redhat.com] > > Sent: Monday, May 28, 2018 3:22 PM > > To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>; > > qemu-devel@nongnu.org; qemu-arm@nongnu.org > > Cc: drjones@redhat.com; imammedo@redhat.com; > peter.maydell@linaro.org; > > alex.williamson@redhat.com; Zhaoshenglong > <zhaoshenglong@huawei.com>; > > Jonathan Cameron <jonathan.cameron@huawei.com>; Linuxarm > > <linuxarm@huawei.com> > > Subject: Re: [RFC v2 6/6] hw/arm: Populate non-contiguous memory regions > > > > Hi Shameer, > > > > On 05/16/2018 05:20 PM, Shameer Kolothum wrote: > > > In case valid iova regions are non-contiguous, split the > > > RAM mem into a 1GB non-pluggable dimm and remaining as a > > > single pc-dimm mem. > > > > Please can you explain where does this split come from? Currently we > > have 254 GB non pluggable RAM. I read the discussion started with Drew > > on RFC v1 where he explained we cannot change the RAM base without > > crashing the FW. However we should at least document this 1GB split. > > The comments were mainly to use the pc-dimm model instead of "mem alias" > method used on RFC v1 as this will help to add the mem hot-plug support > in future. > > I am not sure about the motive behind Drew's idea of splitting the first > 1GB as non-plug and remaining as a pc-dimm(cold). May it is to attempt a > best effort scenario, but as I mentioned in reply to 0/6, the current solution > will end up changing the base address if the 0x4000000 falls under a reserved > region. > > Again, not sure whether we should enforce a strict check on base address > start or just warn the user that it will fail on Guest with UEFI boot[1]. > > Hi Drew, > > Please let me know your thoughts on this. Could you please take a look at the above discussion and let us know your thoughts on the split of mem regions as 1GB non-pluggable and rest as pc-dimm. Thanks, Shameer > > > Signed-off-by: Shameer Kolothum > <shameerali.kolothum.thodi@huawei.com> > > > --- > > > hw/arm/virt.c | 261 > > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > > > 1 file changed, 256 insertions(+), 5 deletions(-) > > > > > > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > > > index be3ad14..562e389 100644 > > > --- a/hw/arm/virt.c > > > +++ b/hw/arm/virt.c > > > @@ -58,6 +58,12 @@ > > > #include "hw/smbios/smbios.h" > > > #include "qapi/visitor.h" > > > #include "standard-headers/linux/input.h" > > > +#include "hw/vfio/vfio-common.h" > > > +#include "qemu/config-file.h" > > > +#include "monitor/qdev.h" > > > +#include "qom/object_interfaces.h" > > > +#include "qapi/qmp/qdict.h" > > > +#include "qemu/option.h" > > > > The comment at the beginning of virt.c would need to be reworked with > > your changes. > > Ok. > > > > #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ > > > static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ > > > @@ -110,7 +116,10 @@ static ARMPlatformBusSystemParams > > platform_bus_params; > > > * terabyte of physical address space.) > > > */ > > > #define RAMLIMIT_GB 255 > > > -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024) > > > +#define SZ_1G (1024ULL * 1024 * 1024) > > > +#define RAMLIMIT_BYTES (RAMLIMIT_GB * SZ_1G) > > > + > > > +#define ALIGN_1G (1ULL << 30) > > > > > > /* Addresses and sizes of our components. > > > * 0..128MB is space for a flash device so we can run bootrom code such as > > UEFI. > > > @@ -1171,6 +1180,236 @@ void virt_machine_done(Notifier *notifier, > void > > *data) > > > virt_build_smbios(vms); > > > } > > > > > > +static void free_iova_copy(struct vfio_iova_head *iova_copy) > > > +{ > > > + VFIOIovaRange *iova, *tmp; > > > + > > > + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { > > > + QLIST_REMOVE(iova, next); > > > + g_free(iova); > > > + } > > > +} > > > + > > > +static void get_iova_copy(struct vfio_iova_head *iova_copy) > > > +{ > > > + VFIOIovaRange *iova, *new, *prev_iova = NULL; > > > + > > > + QLIST_FOREACH(iova, &vfio_iova_regions, next) { > > > + new = g_malloc0(sizeof(*iova)); > > > + new->start = iova->start; > > > + new->end = iova->end; > > > + > > > + if (prev_iova) { > > > + QLIST_INSERT_AFTER(prev_iova, new, next); > > > + } else { > > > + QLIST_INSERT_HEAD(iova_copy, new, next); > > > + } > > > + prev_iova = new; > > > + } > > > +} > > > + > > > +static hwaddr find_memory_chunk(VirtMachineState *vms, > > > + struct vfio_iova_head *iova_copy, > > > + hwaddr req_size, bool pcdimm) > > > +{ > > > + VFIOIovaRange *iova, *tmp; > > > + hwaddr new_start, new_size, sz_align; > > > + hwaddr virt_start = vms->memmap[VIRT_MEM].base; > > > + hwaddr addr_align = ALIGN_1G; /* Set to max ARM64 hugepage size */ > > > + > > > + /* Size alignment */ > > > + sz_align = (pcdimm) ? MAX(TARGET_PAGE_SIZE, > > QEMU_VMALLOC_ALIGN) : > > > + TARGET_PAGE_SIZE; > > > + > > > + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { > > > + if (virt_start >= iova->end) { > > > + continue; > > > + } > > > + > > > + /* Align addr */ > > > + new_start = ROUND_UP(MAX(virt_start, iova->start), addr_align); > > > + if (new_start >= iova->end) { > > > + continue; > > > + } > > > + > > > + if (req_size > iova->end - new_start + 1) { > > > + continue; > > > + } > > don't you want to check directly with new_size? > > Yes. I think that should do. > > > > + > > > + /* > > > + * Check the region can hold any size alignment requirement. > > > + */ > > > + new_size = QEMU_ALIGN_UP(req_size, sz_align); > > > + > > > + if ((new_start + new_size - 1 > iova->end) || > > > + (new_start + new_size >= virt_start + RAMLIMIT_BYTES)) { > > > + continue; > > > + } > > > + > > > + /* > > > + * Modify the iova list entry for non pc-dimm case so that it > > > + * is not used again for pc-dimm allocation. > > > + */ > > > + if (!pcdimm) { > > > + if (new_size - req_size) { > > I don't get this check, Don't you want to check the node's range is > > fully consumed and in the positive remove the node? > > > > (new_size != iova->end - iova-> start + 1) > > Hmm..looks like I missed that. > > > > + iova->start = new_start + new_size; > > > + } else { > > > + QLIST_REMOVE(iova, next); > > > + } > > > + } > > > + return new_start; > > > + } > > > + > > > + return -1; > > > +} > > > + > > > +static void update_memory_regions(VirtMachineState *vms) > > > +{ > > > + hwaddr addr; > > > + VFIOIovaRange *iova; > > > + MachineState *machine = MACHINE(vms); > > > + hwaddr virt_start = vms->memmap[VIRT_MEM].base; > > > + hwaddr req_size, ram_size = machine->ram_size; > > > + struct vfio_iova_head iova_copy = QLIST_HEAD_INITIALIZER(iova_copy); > > > + > > > + /* No valid iova regions, use default */ > > > + if (QLIST_EMPTY(&vfio_iova_regions)) { > > > + vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; > > > + vms->bootinfo.ram_size = ram_size; > > > + return; > > > + } > > > + > > > + /* > > > + * If valid iovas has only one entry, check the req size fits in > > > + * and can have the loader start < 4GB. This will make sure platforms > > > + * with no holes in mem will have the same mem model as before. > > > + */ > > > + req_size = ram_size; > > > + iova = QLIST_NEXT(QLIST_FIRST(&vfio_iova_regions), next); > > > + if (!iova) { > > > + iova = QLIST_FIRST(&vfio_iova_regions); > > > + addr = ROUND_UP(MAX(virt_start, iova->start), ALIGN_1G); > > > + if ((addr < 4 * SZ_1G) && (ram_size <= iova->end - addr + 1) && > > > + (addr + ram_size < virt_start + RAMLIMIT_BYTES)) { > > > + vms->bootinfo.loader_start = addr; > > > + vms->bootinfo.ram_size = ram_size; > > > + return; > > > + } > > > + } > > > + > > > + /* Get a copy of valid iovas and work on it */ > > > + get_iova_copy(&iova_copy); > > > + > > > + /* Split the mem as first 1GB non-pluggable and rest as pc-dimm */ > > > + req_size = MIN(ram_size, SZ_1G); > > > + addr = find_memory_chunk(vms, &iova_copy, req_size, false); > > According to what Drew reported, the base address of the cold-plug RAM > > must stay at 1GB otherwise the FW will be lost. So I think you must > > check the addr = 1GB > > Please see my above comment on the base address. > > > > + if (addr == -1 || addr >= 4 * SZ_1G) { > > > + goto out; > > > + } > > > + > > > + /*Update non-pluggable mem details */ > > > + machine->ram_size = req_size; > > > + vms->bootinfo.loader_start = addr; > > > + vms->bootinfo.ram_size = machine->ram_size; > > > + > > > + req_size = ram_size - req_size; > > > + if (!req_size) { > > > + goto done; > > > + } > > > + > > > + /* Remaining memory is modeled as a pc-dimm. */ > > > + addr = find_memory_chunk(vms, &iova_copy, req_size, true); > > > + if (addr == -1) { > > > + goto out; > > > + } > > > + > > > + /*Update pc-dimm mem details */ > > > + vms->bootinfo.dimm_mem = g_new(struct dimm_mem_info, 1); > > > + vms->bootinfo.dimm_mem->base = addr; > > > + vms->bootinfo.dimm_mem->size = req_size; > > > + machine->maxram_size = machine->ram_size + req_size; > > > + machine->ram_slots += 1; > > > + > > > +done: > > > + free_iova_copy(&iova_copy); > > > + return; > > > + > > > +out: > > > + free_iova_copy(&iova_copy); > > > + error_report("mach-virt: Not enough contiguous memory to model > ram"); > > Output the requested size would help debug (and maybe the available IOVA > > ranges) > > Agree. I will change that. > > Thanks, > Shameer > > [1] > https://github.com/tianocore/edk2/blob/master/ArmVirtPkg/ArmVirtQemu.dsc > #L133 > > > Thanks > > > > Eric > > > + exit(1); > > > +} > > > + > > > +static void create_pcdimms(VirtMachineState *vms, > > > + MemoryRegion *sysmem, > > > + MemoryRegion *ram) > > > +{ > > > + hwaddr addr, size; > > > + Error *local_err = NULL; > > > + QDict *qdict; > > > + QemuOpts *opts; > > > + char *tmp; > > > + > > > + if (!vms->bootinfo.dimm_mem) { > > > + return; > > > + } > > > + > > > + addr = vms->bootinfo.dimm_mem->base; > > > + size = vms->bootinfo.dimm_mem->size; > > > + > > > + /*Create hotplug address space */ > > > + vms->hotplug_memory.base = ROUND_UP(addr, ALIGN_1G); > > > + size = ROUND_UP(size, MAX(TARGET_PAGE_SIZE, > > QEMU_VMALLOC_ALIGN)); > > > + > > > + memory_region_init(&vms->hotplug_memory.mr, OBJECT(vms), > > > + "hotplug-memory", size); > > > + memory_region_add_subregion(sysmem, vms->hotplug_memory.base, > > > + &vms->hotplug_memory.mr); > > > + /* Create backend mem object */ > > > + qdict = qdict_new(); > > > + qdict_put_str(qdict, "qom-type", "memory-backend-ram"); > > > + qdict_put_str(qdict, "id", "mem1"); > > > + tmp = g_strdup_printf("%"PRIu64 "M", size / (1024 * 1024)); > > > + qdict_put_str(qdict, "size", tmp); > > > + g_free(tmp); > > > + > > > + opts = qemu_opts_from_qdict(qemu_find_opts("object"), qdict, > > &local_err); > > > + if (local_err) { > > > + goto err; > > > + } > > > + > > > + user_creatable_add_opts(opts, &local_err); > > > + qemu_opts_del(opts); > > > + QDECREF(qdict); > > > + if (local_err) { > > > + goto err; > > > + } > > > + > > > + /* Create pc-dimm dev*/ > > > + qdict = qdict_new(); > > > + qdict_put_str(qdict, "driver", "pc-dimm"); > > > + qdict_put_str(qdict, "id", "dimm1"); > > > + qdict_put_str(qdict, "memdev", "mem1"); > > > + > > > + opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, > > &local_err); > > > + if (local_err) { > > > + goto err; > > > + } > > > + > > > + qdev_device_add(opts, &local_err); > > > + qemu_opts_del(opts); > > > + QDECREF(qdict); > > > + if (local_err) { > > > + goto err; > > > + } > > > + > > > + return; > > > + > > > +err: > > > + error_report_err(local_err); > > > + exit(1); > > > +} > > > + > > > static void virt_ram_memory_region_init(Notifier *notifier, void *data) > > > { > > > MemoryRegion *sysmem = get_system_memory(); > > > @@ -1179,9 +1418,14 @@ static void > virt_ram_memory_region_init(Notifier > > *notifier, void *data) > > > ram_memory_region_init); > > > MachineState *machine = MACHINE(vms); > > > > > > + update_memory_regions(vms); > > > memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", > > > machine->ram_size); > > > - memory_region_add_subregion(sysmem, vms- > > >memmap[VIRT_MEM].base, ram); > > > + memory_region_add_subregion(sysmem, vms->bootinfo.loader_start, > > ram); > > > + > > > + if (vms->bootinfo.dimm_mem) { > > > + create_pcdimms(vms, sysmem, ram); > > > + } > > > } > > > > > > static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx) > > > @@ -1404,13 +1648,11 @@ static void machvirt_init(MachineState > > *machine) > > > vms->machine_done.notify = virt_machine_done; > > > qemu_add_machine_init_done_notifier(&vms->machine_done); > > > > > > - vms->bootinfo.ram_size = machine->ram_size; > > > vms->bootinfo.kernel_filename = machine->kernel_filename; > > > vms->bootinfo.kernel_cmdline = machine->kernel_cmdline; > > > vms->bootinfo.initrd_filename = machine->initrd_filename; > > > vms->bootinfo.nb_cpus = smp_cpus; > > > vms->bootinfo.board_id = -1; > > > - vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; > > > vms->bootinfo.get_dtb = machvirt_dtb; > > > vms->bootinfo.firmware_loaded = firmware_loaded; > > > > > > @@ -1559,7 +1801,7 @@ static void virt_dimm_plug(HotplugHandler > > *hotplug_dev, > > > PCDIMMDevice *dimm = PC_DIMM(dev); > > > PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); > > > MemoryRegion *mr; > > > - uint64_t align; > > > + uint64_t align, addr; > > > Error *local_err = NULL; > > > > > > mr = ddc->get_memory_region(dimm, &local_err); > > > @@ -1573,6 +1815,15 @@ static void virt_dimm_plug(HotplugHandler > > *hotplug_dev, > > > goto out; > > > } > > > > > > + addr = object_property_get_uint(OBJECT(dimm), > > PC_DIMM_ADDR_PROP, > > > + &error_fatal); > > > + /* Assign the node for pc-dimm initial ram */ > > > + if (vms->bootinfo.dimm_mem && (addr == vms->bootinfo.dimm_mem- > > >base) > > > + && (nb_numa_nodes > 0)) { > > > + vms->bootinfo.dimm_mem->node = > > object_property_get_uint(OBJECT(dev), > > > + PC_DIMM_NODE_PROP, &error_fatal); > > > + } > > > + > > > out: > > > error_propagate(errp, local_err); > > > } > > >
On Tue, Jun 05, 2018 at 07:49:44AM +0000, Shameerali Kolothum Thodi wrote: > > > On 05/16/2018 05:20 PM, Shameer Kolothum wrote: > > > > In case valid iova regions are non-contiguous, split the > > > > RAM mem into a 1GB non-pluggable dimm and remaining as a > > > > single pc-dimm mem. > > > > > > Please can you explain where does this split come from? Currently we > > > have 254 GB non pluggable RAM. I read the discussion started with Drew > > > on RFC v1 where he explained we cannot change the RAM base without > > > crashing the FW. However we should at least document this 1GB split. > > > > The comments were mainly to use the pc-dimm model instead of "mem alias" > > method used on RFC v1 as this will help to add the mem hot-plug support > > in future. > > > > I am not sure about the motive behind Drew's idea of splitting the first > > 1GB as non-plug and remaining as a pc-dimm(cold). May it is to attempt a > > best effort scenario, but as I mentioned in reply to 0/6, the current solution > > will end up changing the base address if the 0x4000000 falls under a reserved > > region. > > > > Again, not sure whether we should enforce a strict check on base address > > start or just warn the user that it will fail on Guest with UEFI boot[1]. > > > > Hi Drew, > > > > Please let me know your thoughts on this. > > Could you please take a look at the above discussion and let us > know your thoughts on the split of mem regions as 1GB non-pluggable > and rest as pc-dimm. > Hi Shameer, Sorry for the slow reply - I've been slowly catching back up after vacation. There are two reasons to have one non-pluggable memory region and the rest of memory in a single pluggable pc-dimm. 1) For mach-virt we must have the base of memory at the 1G boundary, both because otherwise we'll break ArmVirt UEFI and because that's a guarantee that Peter has stated he'd like to keep for mach-virt. 2) Memory split in this way already has precedent in the x86 PC machine models. It's debatable how much memory we should allocate to the non-pluggable region. It doesn't need to be 1G, it could be smaller. The default memory size allocated to a mach-virt guest that doesn't provide '-m' on the command line is 128M. Maybe we should use that size? If 0x40000000 falls into a reserved address region on some host, then I'm afraid the mach-virt model won't work with those devices unless the guest doesn't use UEFI and Peter is open to providing a machine property that one can enable in order to move the base address of memory. I know Eric is looking at this problem too. I hope you guys are coordinating your efforts. Thanks, drew
On 15 June 2018 at 16:44, Andrew Jones <drjones@redhat.com> wrote: > If 0x40000000 falls into a reserved address region on some host, then > I'm afraid the mach-virt model won't work with those devices unless the > guest doesn't use UEFI and Peter is open to providing a machine property > that one can enable in order to move the base address of memory. Why should the VM ever care about where the address regions in the host happen to be when it comes to where it's putting its RAM in the VM's address layout? I feel like I'm missing something here... thanks -- PMM
Hi Drew, On 06/15/2018 05:44 PM, Andrew Jones wrote: > On Tue, Jun 05, 2018 at 07:49:44AM +0000, Shameerali Kolothum Thodi wrote: >>>> On 05/16/2018 05:20 PM, Shameer Kolothum wrote: >>>>> In case valid iova regions are non-contiguous, split the >>>>> RAM mem into a 1GB non-pluggable dimm and remaining as a >>>>> single pc-dimm mem. >>>> >>>> Please can you explain where does this split come from? Currently we >>>> have 254 GB non pluggable RAM. I read the discussion started with Drew >>>> on RFC v1 where he explained we cannot change the RAM base without >>>> crashing the FW. However we should at least document this 1GB split. >>> >>> The comments were mainly to use the pc-dimm model instead of "mem alias" >>> method used on RFC v1 as this will help to add the mem hot-plug support >>> in future. >>> >>> I am not sure about the motive behind Drew's idea of splitting the first >>> 1GB as non-plug and remaining as a pc-dimm(cold). May it is to attempt a >>> best effort scenario, but as I mentioned in reply to 0/6, the current solution >>> will end up changing the base address if the 0x4000000 falls under a reserved >>> region. >>> >>> Again, not sure whether we should enforce a strict check on base address >>> start or just warn the user that it will fail on Guest with UEFI boot[1]. >>> >>> Hi Drew, >>> >>> Please let me know your thoughts on this. >> >> Could you please take a look at the above discussion and let us >> know your thoughts on the split of mem regions as 1GB non-pluggable >> and rest as pc-dimm. >> > > Hi Shameer, > > Sorry for the slow reply - I've been slowly catching back up after > vacation. There are two reasons to have one non-pluggable memory region > and the rest of memory in a single pluggable pc-dimm. > > 1) For mach-virt we must have the base of memory at the 1G boundary, > both because otherwise we'll break ArmVirt UEFI and because that's > a guarantee that Peter has stated he'd like to keep for mach-virt. > > 2) Memory split in this way already has precedent in the x86 PC > machine models. > > It's debatable how much memory we should allocate to the non-pluggable > region. It doesn't need to be 1G, it could be smaller. The default > memory size allocated to a mach-virt guest that doesn't provide '-m' > on the command line is 128M. Maybe we should use that size? > > If 0x40000000 falls into a reserved address region on some host, then > I'm afraid the mach-virt model won't work with those devices unless the > guest doesn't use UEFI and Peter is open to providing a machine property > that one can enable in order to move the base address of memory. > > I know Eric is looking at this problem too. I hope you guys are > coordinating your efforts. Yes we sync'ed together. I will send an RFC beginning of next week addressing both - support of >40b PA VM (based on Suzuki's series) - addition of DIMM at 2TB, reusing the standard PC-DIMM hotplug framework. This is something standard and similar to what is done on PC Q35. I am reusing some of Shameer's patches, rebased on top of Igor and David recent works. Then we need to understand how we can use 2 hotplug regions. This is not obvsious to me as the framework currently uses a dedicated MemoryRegion, if I understand it correctly. Thanks Eric > > Thanks, > drew >
Hi, On 06/15/2018 05:54 PM, Peter Maydell wrote: > Why should the VM ever care about where the address regions in the > host happen to be when it comes to where it's putting its RAM > in the VM's address layout? I feel like I'm missing something here... > > thanks The VM cannot use RAM GPA that matches assigned device reserved IOVA regions. When you assign a device, the whole VM RAM is mapped in the physical IOMMU and IOVA corresponds to GPA. but sometimes some IOVA cannot be used due to the host specificities and especially iommu peculiarities. Some IOVAs may be simply "bypassed" by the iommu, like on x86 and also with some ARM SMMU integration. In such a case the DMA accesses have no chance to reach the guest RAM as expected. Hope it clarifies. Thanks Eric
On 15 June 2018 at 17:13, Auger Eric <eric.auger@redhat.com> wrote: > On 06/15/2018 05:54 PM, Peter Maydell wrote: >> Why should the VM ever care about where the address regions in the >> host happen to be when it comes to where it's putting its RAM >> in the VM's address layout? I feel like I'm missing something here... > The VM cannot use RAM GPA that matches assigned device reserved IOVA > regions. When you assign a device, the whole VM RAM is mapped in the > physical IOMMU and IOVA corresponds to GPA. but sometimes some IOVA > cannot be used due to the host specificities and especially iommu > peculiarities. Some IOVAs may be simply "bypassed" by the iommu, like on > x86 and also with some ARM SMMU integration. In such a case the DMA > accesses have no chance to reach the guest RAM as expected. Hope it > clarifies. Hmm. We don't want to have the address layout of the VM have to cope with all the various oddities that host systems might have. Is this kind of thing common, or rare? I'd much rather just say "we don't support device passthrough on that sort of system". thanks -- PMM
> -----Original Message----- > From: Peter Maydell [mailto:peter.maydell@linaro.org] > Sent: 15 June 2018 17:33 > To: Auger Eric <eric.auger@redhat.com> > Cc: Andrew Jones <drjones@redhat.com>; Shameerali Kolothum Thodi > <shameerali.kolothum.thodi@huawei.com>; qemu-devel@nongnu.org; qemu- > arm@nongnu.org; Jonathan Cameron <jonathan.cameron@huawei.com>; > Linuxarm <linuxarm@huawei.com>; alex.williamson@redhat.com; > Zhaoshenglong <zhaoshenglong@huawei.com>; imammedo@redhat.com > Subject: Re: [Qemu-devel] [RFC v2 6/6] hw/arm: Populate non-contiguous > memory regions > > On 15 June 2018 at 17:13, Auger Eric <eric.auger@redhat.com> wrote: > > On 06/15/2018 05:54 PM, Peter Maydell wrote: > >> Why should the VM ever care about where the address regions in the > >> host happen to be when it comes to where it's putting its RAM > >> in the VM's address layout? I feel like I'm missing something here... > > > The VM cannot use RAM GPA that matches assigned device reserved IOVA > > regions. When you assign a device, the whole VM RAM is mapped in the > > physical IOMMU and IOVA corresponds to GPA. but sometimes some IOVA > > cannot be used due to the host specificities and especially iommu > > peculiarities. Some IOVAs may be simply "bypassed" by the iommu, like on > > x86 and also with some ARM SMMU integration. In such a case the DMA > > accesses have no chance to reach the guest RAM as expected. Hope it > > clarifies. > > Hmm. We don't want to have the address layout of > the VM have to cope with all the various oddities that host > systems might have. Is this kind of thing common, or rare? > I'd much rather just say "we don't support device passthrough > on that sort of system". > As far as I know on ARM64 platforms this is not very rare mainly because of the phys MSI doorbell and the fact that memory map is not standardized. This was discussed in LPC sometime back[1] and a sysfs interface was provided to report these reserved regions. And later Alex commented that vfio needs a more robust way of identifying and reporting these regions and that’s how the vfio kernel patch series[2] was introduced on which this qemu series is based on. Thanks, Shameer [1] https://lkml.org/lkml/2016/11/7/935 [2] https://lkml.org/lkml/2018/4/18/293
diff --git a/hw/arm/virt.c b/hw/arm/virt.c index be3ad14..562e389 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -58,6 +58,12 @@ #include "hw/smbios/smbios.h" #include "qapi/visitor.h" #include "standard-headers/linux/input.h" +#include "hw/vfio/vfio-common.h" +#include "qemu/config-file.h" +#include "monitor/qdev.h" +#include "qom/object_interfaces.h" +#include "qapi/qmp/qdict.h" +#include "qemu/option.h" #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ @@ -110,7 +116,10 @@ static ARMPlatformBusSystemParams platform_bus_params; * terabyte of physical address space.) */ #define RAMLIMIT_GB 255 -#define RAMLIMIT_BYTES (RAMLIMIT_GB * 1024ULL * 1024 * 1024) +#define SZ_1G (1024ULL * 1024 * 1024) +#define RAMLIMIT_BYTES (RAMLIMIT_GB * SZ_1G) + +#define ALIGN_1G (1ULL << 30) /* Addresses and sizes of our components. * 0..128MB is space for a flash device so we can run bootrom code such as UEFI. @@ -1171,6 +1180,236 @@ void virt_machine_done(Notifier *notifier, void *data) virt_build_smbios(vms); } +static void free_iova_copy(struct vfio_iova_head *iova_copy) +{ + VFIOIovaRange *iova, *tmp; + + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { + QLIST_REMOVE(iova, next); + g_free(iova); + } +} + +static void get_iova_copy(struct vfio_iova_head *iova_copy) +{ + VFIOIovaRange *iova, *new, *prev_iova = NULL; + + QLIST_FOREACH(iova, &vfio_iova_regions, next) { + new = g_malloc0(sizeof(*iova)); + new->start = iova->start; + new->end = iova->end; + + if (prev_iova) { + QLIST_INSERT_AFTER(prev_iova, new, next); + } else { + QLIST_INSERT_HEAD(iova_copy, new, next); + } + prev_iova = new; + } +} + +static hwaddr find_memory_chunk(VirtMachineState *vms, + struct vfio_iova_head *iova_copy, + hwaddr req_size, bool pcdimm) +{ + VFIOIovaRange *iova, *tmp; + hwaddr new_start, new_size, sz_align; + hwaddr virt_start = vms->memmap[VIRT_MEM].base; + hwaddr addr_align = ALIGN_1G; /* Set to max ARM64 hugepage size */ + + /* Size alignment */ + sz_align = (pcdimm) ? MAX(TARGET_PAGE_SIZE, QEMU_VMALLOC_ALIGN) : + TARGET_PAGE_SIZE; + + QLIST_FOREACH_SAFE(iova, iova_copy, next, tmp) { + if (virt_start >= iova->end) { + continue; + } + + /* Align addr */ + new_start = ROUND_UP(MAX(virt_start, iova->start), addr_align); + if (new_start >= iova->end) { + continue; + } + + if (req_size > iova->end - new_start + 1) { + continue; + } + + /* + * Check the region can hold any size alignment requirement. + */ + new_size = QEMU_ALIGN_UP(req_size, sz_align); + + if ((new_start + new_size - 1 > iova->end) || + (new_start + new_size >= virt_start + RAMLIMIT_BYTES)) { + continue; + } + + /* + * Modify the iova list entry for non pc-dimm case so that it + * is not used again for pc-dimm allocation. + */ + if (!pcdimm) { + if (new_size - req_size) { + iova->start = new_start + new_size; + } else { + QLIST_REMOVE(iova, next); + } + } + return new_start; + } + + return -1; +} + +static void update_memory_regions(VirtMachineState *vms) +{ + hwaddr addr; + VFIOIovaRange *iova; + MachineState *machine = MACHINE(vms); + hwaddr virt_start = vms->memmap[VIRT_MEM].base; + hwaddr req_size, ram_size = machine->ram_size; + struct vfio_iova_head iova_copy = QLIST_HEAD_INITIALIZER(iova_copy); + + /* No valid iova regions, use default */ + if (QLIST_EMPTY(&vfio_iova_regions)) { + vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; + vms->bootinfo.ram_size = ram_size; + return; + } + + /* + * If valid iovas has only one entry, check the req size fits in + * and can have the loader start < 4GB. This will make sure platforms + * with no holes in mem will have the same mem model as before. + */ + req_size = ram_size; + iova = QLIST_NEXT(QLIST_FIRST(&vfio_iova_regions), next); + if (!iova) { + iova = QLIST_FIRST(&vfio_iova_regions); + addr = ROUND_UP(MAX(virt_start, iova->start), ALIGN_1G); + if ((addr < 4 * SZ_1G) && (ram_size <= iova->end - addr + 1) && + (addr + ram_size < virt_start + RAMLIMIT_BYTES)) { + vms->bootinfo.loader_start = addr; + vms->bootinfo.ram_size = ram_size; + return; + } + } + + /* Get a copy of valid iovas and work on it */ + get_iova_copy(&iova_copy); + + /* Split the mem as first 1GB non-pluggable and rest as pc-dimm */ + req_size = MIN(ram_size, SZ_1G); + addr = find_memory_chunk(vms, &iova_copy, req_size, false); + if (addr == -1 || addr >= 4 * SZ_1G) { + goto out; + } + + /*Update non-pluggable mem details */ + machine->ram_size = req_size; + vms->bootinfo.loader_start = addr; + vms->bootinfo.ram_size = machine->ram_size; + + req_size = ram_size - req_size; + if (!req_size) { + goto done; + } + + /* Remaining memory is modeled as a pc-dimm. */ + addr = find_memory_chunk(vms, &iova_copy, req_size, true); + if (addr == -1) { + goto out; + } + + /*Update pc-dimm mem details */ + vms->bootinfo.dimm_mem = g_new(struct dimm_mem_info, 1); + vms->bootinfo.dimm_mem->base = addr; + vms->bootinfo.dimm_mem->size = req_size; + machine->maxram_size = machine->ram_size + req_size; + machine->ram_slots += 1; + +done: + free_iova_copy(&iova_copy); + return; + +out: + free_iova_copy(&iova_copy); + error_report("mach-virt: Not enough contiguous memory to model ram"); + exit(1); +} + +static void create_pcdimms(VirtMachineState *vms, + MemoryRegion *sysmem, + MemoryRegion *ram) +{ + hwaddr addr, size; + Error *local_err = NULL; + QDict *qdict; + QemuOpts *opts; + char *tmp; + + if (!vms->bootinfo.dimm_mem) { + return; + } + + addr = vms->bootinfo.dimm_mem->base; + size = vms->bootinfo.dimm_mem->size; + + /*Create hotplug address space */ + vms->hotplug_memory.base = ROUND_UP(addr, ALIGN_1G); + size = ROUND_UP(size, MAX(TARGET_PAGE_SIZE, QEMU_VMALLOC_ALIGN)); + + memory_region_init(&vms->hotplug_memory.mr, OBJECT(vms), + "hotplug-memory", size); + memory_region_add_subregion(sysmem, vms->hotplug_memory.base, + &vms->hotplug_memory.mr); + /* Create backend mem object */ + qdict = qdict_new(); + qdict_put_str(qdict, "qom-type", "memory-backend-ram"); + qdict_put_str(qdict, "id", "mem1"); + tmp = g_strdup_printf("%"PRIu64 "M", size / (1024 * 1024)); + qdict_put_str(qdict, "size", tmp); + g_free(tmp); + + opts = qemu_opts_from_qdict(qemu_find_opts("object"), qdict, &local_err); + if (local_err) { + goto err; + } + + user_creatable_add_opts(opts, &local_err); + qemu_opts_del(opts); + QDECREF(qdict); + if (local_err) { + goto err; + } + + /* Create pc-dimm dev*/ + qdict = qdict_new(); + qdict_put_str(qdict, "driver", "pc-dimm"); + qdict_put_str(qdict, "id", "dimm1"); + qdict_put_str(qdict, "memdev", "mem1"); + + opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, &local_err); + if (local_err) { + goto err; + } + + qdev_device_add(opts, &local_err); + qemu_opts_del(opts); + QDECREF(qdict); + if (local_err) { + goto err; + } + + return; + +err: + error_report_err(local_err); + exit(1); +} + static void virt_ram_memory_region_init(Notifier *notifier, void *data) { MemoryRegion *sysmem = get_system_memory(); @@ -1179,9 +1418,14 @@ static void virt_ram_memory_region_init(Notifier *notifier, void *data) ram_memory_region_init); MachineState *machine = MACHINE(vms); + update_memory_regions(vms); memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", machine->ram_size); - memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram); + memory_region_add_subregion(sysmem, vms->bootinfo.loader_start, ram); + + if (vms->bootinfo.dimm_mem) { + create_pcdimms(vms, sysmem, ram); + } } static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx) @@ -1404,13 +1648,11 @@ static void machvirt_init(MachineState *machine) vms->machine_done.notify = virt_machine_done; qemu_add_machine_init_done_notifier(&vms->machine_done); - vms->bootinfo.ram_size = machine->ram_size; vms->bootinfo.kernel_filename = machine->kernel_filename; vms->bootinfo.kernel_cmdline = machine->kernel_cmdline; vms->bootinfo.initrd_filename = machine->initrd_filename; vms->bootinfo.nb_cpus = smp_cpus; vms->bootinfo.board_id = -1; - vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; vms->bootinfo.get_dtb = machvirt_dtb; vms->bootinfo.firmware_loaded = firmware_loaded; @@ -1559,7 +1801,7 @@ static void virt_dimm_plug(HotplugHandler *hotplug_dev, PCDIMMDevice *dimm = PC_DIMM(dev); PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); MemoryRegion *mr; - uint64_t align; + uint64_t align, addr; Error *local_err = NULL; mr = ddc->get_memory_region(dimm, &local_err); @@ -1573,6 +1815,15 @@ static void virt_dimm_plug(HotplugHandler *hotplug_dev, goto out; } + addr = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP, + &error_fatal); + /* Assign the node for pc-dimm initial ram */ + if (vms->bootinfo.dimm_mem && (addr == vms->bootinfo.dimm_mem->base) + && (nb_numa_nodes > 0)) { + vms->bootinfo.dimm_mem->node = object_property_get_uint(OBJECT(dev), + PC_DIMM_NODE_PROP, &error_fatal); + } + out: error_propagate(errp, local_err); }
In case valid iova regions are non-contiguous, split the RAM mem into a 1GB non-pluggable dimm and remaining as a single pc-dimm mem. Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> --- hw/arm/virt.c | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 256 insertions(+), 5 deletions(-)