
[v2,1/3] hw/acpi-build: build SRAT memory affinity structures for DIMM devices

Message ID 20180228040300.8914-2-haozhong.zhang@intel.com (mailing list archive)
State New, archived

Commit Message

Haozhong Zhang Feb. 28, 2018, 4:02 a.m. UTC
ACPI 6.2A Table 5-129 "SPA Range Structure" requires that the proximity
domain of an NVDIMM SPA range match the corresponding entry in the
SRAT table.

The address ranges of vNVDIMM in QEMU are allocated from the
hot-pluggable address space, which is entirely covered by one SRAT
memory affinity structure. However, users can set the vNVDIMM
proximity domain in the NFIT SPA range structure via the 'node'
property of '-device nvdimm' to a value different from the one in the
above SRAT memory affinity structure.

To resolve this proximity domain mismatch, this patch builds
one SRAT memory affinity structure for each static-plugged DIMM device,
including both PC-DIMM and NVDIMM, with the proximity domain specified
in '-device pc-dimm' or '-device nvdimm'.

The remaining hot-pluggable address space is covered by one or multiple
SRAT memory affinity structures with the proximity domain of the last
node as before.
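
For example (a hypothetical layout, not taken from the patch): with 1 GiB
of hot-pluggable address space starting at 0x100000000, a single 256 MiB
NVDIMM plugged at the bottom of that space with 'node=0', and node 1
being the last node, the resulting SRAT entries would be:

  0x100000000 - 0x110000000  node 0  (hotpluggable, enabled, non-volatile)
  0x110000000 - 0x140000000  node 1  (hotpluggable, enabled)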

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
 hw/i386/acpi-build.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++----
 hw/mem/pc-dimm.c         |  8 ++++++++
 include/hw/mem/pc-dimm.h | 10 ++++++++++
 3 files changed, 64 insertions(+), 4 deletions(-)

Comments

Igor Mammedov March 1, 2018, 10:42 a.m. UTC | #1
On Wed, 28 Feb 2018 12:02:58 +0800
Haozhong Zhang <haozhong.zhang@intel.com> wrote:

> ACPI 6.2A Table 5-129 "SPA Range Structure" requires that the proximity
> domain of an NVDIMM SPA range match the corresponding entry in the
> SRAT table.
> 
> The address ranges of vNVDIMM in QEMU are allocated from the
> hot-pluggable address space, which is entirely covered by one SRAT
> memory affinity structure. However, users can set the vNVDIMM
> proximity domain in the NFIT SPA range structure via the 'node'
> property of '-device nvdimm' to a value different from the one in the
> above SRAT memory affinity structure.
> 
> To resolve this proximity domain mismatch, this patch builds
> one SRAT memory affinity structure for each static-plugged DIMM device,
s/static-plugged/present at boot/
since after hotplug and a following reset the SRAT will be recreated
and will include hotplugged DIMMs as well.

> including both PC-DIMM and NVDIMM, with the proximity domain specified
> in '-device pc-dimm' or '-device nvdimm'.
> 
> The remaining hot-pluggable address space is covered by one or multiple
> SRAT memory affinity structures with the proximity domain of the last
> node as before.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> ---
>  hw/i386/acpi-build.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++----
>  hw/mem/pc-dimm.c         |  8 ++++++++
>  include/hw/mem/pc-dimm.h | 10 ++++++++++
>  3 files changed, 64 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index deb440f286..a88de06d8f 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -2323,6 +2323,49 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
>  #define HOLE_640K_START  (640 * 1024)
>  #define HOLE_640K_END   (1024 * 1024)
>  
> +static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
> +                                           uint64_t len, int default_node)
> +{
> +    GSList *dimms = pc_dimm_get_device_list();
> +    GSList *ent = dimms;
> +    PCDIMMDevice *dev;
> +    Object *obj;
> +    uint64_t end = base + len, addr, size;
> +    int node;
> +    AcpiSratMemoryAffinity *numamem;
> +
> +    while (base < end) {
It's just a matter of taste, but wouldn't a 'for' loop be better here?
One can see the start, end, and next step from the beginning.
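
For instance, a sketch of that shape reusing the patch's variables (an
illustration only, not a drop-in replacement):

    for (ent = dimms; base < end; base = addr + size, ent = g_slist_next(ent)) {
        /* the !ent branch below breaks out before this step expression
         * would read addr/size on an exhausted list */
        ...
    }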

> +        numamem = acpi_data_push(table_data, sizeof *numamem);
> +
> +        if (!ent) {
> +            build_srat_memory(numamem, base, end - base, default_node,
> +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> +            break;
> +        }
> +
> +        dev = PC_DIMM(ent->data);
> +        obj = OBJECT(dev);
> +        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, NULL);
> +        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
> +        node = object_property_get_uint(obj, PC_DIMM_NODE_PROP, NULL);
> +
> +        if (base < addr) {
> +            build_srat_memory(numamem, base, addr - base, default_node,
> +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> +            numamem = acpi_data_push(table_data, sizeof *numamem);
> +        }
> +        build_srat_memory(numamem, addr, size, node,
> +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED |
Is NVDIMM hotplug supported in QEMU?
If not, we might need to make MEM_AFFINITY_HOTPLUGGABLE conditional too.

> +                          (object_dynamic_cast(obj, TYPE_NVDIMM) ?
> +                           MEM_AFFINITY_NON_VOLATILE : 0));
it might be cleaner without the inline flags duplication:

  flags = MEM_AFFINITY_ENABLED;
  ...
  if (!ent) {
      flags |= MEM_AFFINITY_HOTPLUGGABLE;
  }
  ...
  if (PCDIMMDeviceInfo::hotpluggable) { // see ***
      flags |= MEM_AFFINITY_HOTPLUGGABLE;
  }
  ...
  if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
      flags |= MEM_AFFINITY_NON_VOLATILE;
  }
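
Flattened into C, the per-DIMM branch might then read (a sketch;
MemoryAffinityFlags is the flags type build_srat_memory() already
takes, and dimm_is_hotpluggable is a placeholder predicate standing
in for PCDIMMDeviceInfo::hotpluggable):

  MemoryAffinityFlags flags = MEM_AFFINITY_ENABLED;

  if (dimm_is_hotpluggable) {  /* placeholder, see *** */
      flags |= MEM_AFFINITY_HOTPLUGGABLE;
  }
  if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
      flags |= MEM_AFFINITY_NON_VOLATILE;
  }
  build_srat_memory(numamem, addr, size, node, flags);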

> +
> +        base = addr + size;
> +        ent = g_slist_next(ent);
> +    }
> +
> +    g_slist_free(dimms);
> +}
> +
>  static void
>  build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>  {
> @@ -2434,10 +2477,9 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>       * providing _PXM method if necessary.
>       */
>      if (hotplugabble_address_space_size) {
> -        numamem = acpi_data_push(table_data, sizeof *numamem);
> -        build_srat_memory(numamem, pcms->hotplug_memory.base,
> -                          hotplugabble_address_space_size, pcms->numa_nodes - 1,
> -                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> +        build_srat_hotpluggable_memory(table_data, pcms->hotplug_memory.base,
> +                                       hotplugabble_address_space_size,
> +                                       pcms->numa_nodes - 1);
>      }
>  
>      build_header(linker, table_data,
> diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
> index 6e74b61cb6..9fd901e87a 100644
> --- a/hw/mem/pc-dimm.c
> +++ b/hw/mem/pc-dimm.c
> @@ -276,6 +276,14 @@ static int pc_dimm_built_list(Object *obj, void *opaque)
>      return 0;
>  }
>  
> +GSList *pc_dimm_get_device_list(void)
> +{
> +    GSList *list = NULL;
> +
> +    object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list);
> +    return list;
> +}
(***)
see http://lists.gnu.org/archive/html/qemu-ppc/2018-02/msg00271.html
You could do that in a separate patch, so that it won't matter
whose patch gets merged first and it won't affect the rest of the patches.
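
For context on the sorting: pc_dimm_built_list() already does a sorted
insert keyed on the DIMM base address, roughly along these lines (a
sketch of the existing comparator in hw/mem/pc-dimm.c, not new code):

  static gint pc_dimm_addr_sort(gconstpointer a, gconstpointer b)
  {
      PCDIMMDevice *x = PC_DIMM(a);
      PCDIMMDevice *y = PC_DIMM(b);
      Int128 diff = int128_sub(int128_make64(x->addr), int128_make64(y->addr));

      if (int128_lt(diff, int128_zero())) {
          return -1;
      } else if (int128_gt(diff, int128_zero())) {
          return 1;
      }
      return 0;
  }

That is what lets pc_dimm_get_device_list() promise ascending
base-address order without extra work.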


>  uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
>                                 uint64_t address_space_size,
>                                 uint64_t *hint, uint64_t align, uint64_t size,
> diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
> index d83b957829..4cf5cc49e9 100644
> --- a/include/hw/mem/pc-dimm.h
> +++ b/include/hw/mem/pc-dimm.h
> @@ -100,4 +100,14 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
>                           MemoryRegion *mr, uint64_t align, Error **errp);
>  void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
>                             MemoryRegion *mr);
> +
> +/*
> + * Return a list of DeviceState of pc-dimm and nvdimm devices.
> + * The list is sorted in ascending order of the devices' base
> + * addresses.
> + *
> + * Note: callers are responsible for freeing the list.
> + */
> +GSList *pc_dimm_get_device_list(void);
> +
>  #endif
Haozhong Zhang March 1, 2018, 11:56 a.m. UTC | #2
On 03/01/18 11:42 +0100, Igor Mammedov wrote:
> On Wed, 28 Feb 2018 12:02:58 +0800
> Haozhong Zhang <haozhong.zhang@intel.com> wrote:
> 
> > ACPI 6.2A Table 5-129 "SPA Range Structure" requires that the proximity
> > domain of an NVDIMM SPA range match the corresponding entry in the
> > SRAT table.
> > 
> > The address ranges of vNVDIMM in QEMU are allocated from the
> > hot-pluggable address space, which is entirely covered by one SRAT
> > memory affinity structure. However, users can set the vNVDIMM
> > proximity domain in the NFIT SPA range structure via the 'node'
> > property of '-device nvdimm' to a value different from the one in the
> > above SRAT memory affinity structure.
> > 
> > To resolve this proximity domain mismatch, this patch builds
> > one SRAT memory affinity structure for each static-plugged DIMM device,
> s/static-plugged/present at boot/
> since after hotplug and a following reset the SRAT will be recreated
> and will include hotplugged DIMMs as well.

Ah yes, I'll fix the message in the next version.

> 
> > including both PC-DIMM and NVDIMM, with the proximity domain specified
> > in '-device pc-dimm' or '-device nvdimm'.
> > 
> > The remaining hot-pluggable address space is covered by one or multiple
> > SRAT memory affinity structures with the proximity domain of the last
> > node as before.
> > 
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > ---
> >  hw/i386/acpi-build.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++----
> >  hw/mem/pc-dimm.c         |  8 ++++++++
> >  include/hw/mem/pc-dimm.h | 10 ++++++++++
> >  3 files changed, 64 insertions(+), 4 deletions(-)
> > 
> > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > index deb440f286..a88de06d8f 100644
> > --- a/hw/i386/acpi-build.c
> > +++ b/hw/i386/acpi-build.c
> > @@ -2323,6 +2323,49 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
> >  #define HOLE_640K_START  (640 * 1024)
> >  #define HOLE_640K_END   (1024 * 1024)
> >  
> > +static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
> > +                                           uint64_t len, int default_node)
> > +{
> > +    GSList *dimms = pc_dimm_get_device_list();
> > +    GSList *ent = dimms;
> > +    PCDIMMDevice *dev;
> > +    Object *obj;
> > +    uint64_t end = base + len, addr, size;
> > +    int node;
> > +    AcpiSratMemoryAffinity *numamem;
> > +
> > +    while (base < end) {
> It's just a matter of taste, but wouldn't a 'for' loop be better here?
> One can see the start, end, and next step from the beginning.

will switch to a for loop

> 
> > +        numamem = acpi_data_push(table_data, sizeof *numamem);
> > +
> > +        if (!ent) {
> > +            build_srat_memory(numamem, base, end - base, default_node,
> > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > +            break;
> > +        }
> > +
> > +        dev = PC_DIMM(ent->data);
> > +        obj = OBJECT(dev);
> > +        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, NULL);
> > +        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
> > +        node = object_property_get_uint(obj, PC_DIMM_NODE_PROP, NULL);
> > +
> > +        if (base < addr) {
> > +            build_srat_memory(numamem, base, addr - base, default_node,
> > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > +            numamem = acpi_data_push(table_data, sizeof *numamem);
> > +        }
> > +        build_srat_memory(numamem, addr, size, node,
> > +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED |
> Is NVDIMM hotplug supported in QEMU?
> If not, we might need to make MEM_AFFINITY_HOTPLUGGABLE conditional too.

Yes, it's supported.

> 
> > +                          (object_dynamic_cast(obj, TYPE_NVDIMM) ?
> > +                           MEM_AFFINITY_NON_VOLATILE : 0));
> it might be cleaner without the inline flags duplication:
> 
>   flags = MEM_AFFINITY_ENABLED;
>   ...
>   if (!ent) {
>       flags |= MEM_AFFINITY_HOTPLUGGABLE;
>   }
>   ...
>   if (PCDIMMDeviceInfo::hotpluggable) { // see ***
>       flags |= MEM_AFFINITY_HOTPLUGGABLE;
>   }
>   ...
>   if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
>       flags |= MEM_AFFINITY_NON_VOLATILE;
>   }

I'm fine with such changes, except ***

[..]
> > diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
> > index 6e74b61cb6..9fd901e87a 100644
> > --- a/hw/mem/pc-dimm.c
> > +++ b/hw/mem/pc-dimm.c
> > @@ -276,6 +276,14 @@ static int pc_dimm_built_list(Object *obj, void *opaque)
> >      return 0;
> >  }
> >  
> > +GSList *pc_dimm_get_device_list(void)
> > +{
> > +    GSList *list = NULL;
> > +
> > +    object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list);
> > +    return list;
> > +}
> (***)
> see http://lists.gnu.org/archive/html/qemu-ppc/2018-02/msg00271.html
> You could do that in a separate patch, so that it won't matter
> whose patch gets merged first and it won't affect the rest of the patches.

Sure, I can separate this part, but I would still like to use a list
of PCDIMMDevice rather than a list of MemoryDeviceInfo. The latter
would need to be extended to include NVDIMM information (e.g., adding
an NVDIMMDeviceInfo to the union).

Haozhong

> 
> 
> >  uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
> >                                 uint64_t address_space_size,
> >                                 uint64_t *hint, uint64_t align, uint64_t size,
> > diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
> > index d83b957829..4cf5cc49e9 100644
> > --- a/include/hw/mem/pc-dimm.h
> > +++ b/include/hw/mem/pc-dimm.h
> > @@ -100,4 +100,14 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
> >                           MemoryRegion *mr, uint64_t align, Error **errp);
> >  void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
> >                             MemoryRegion *mr);
> > +
> > +/*
> > + * Return a list of DeviceState of pc-dimm and nvdimm devices.
> > + * The list is sorted in ascending order of the devices' base
> > + * addresses.
> > + *
> > + * Note: callers are responsible for freeing the list.
> > + */
> > +GSList *pc_dimm_get_device_list(void);
> > +
> >  #endif
>
Igor Mammedov March 1, 2018, 1:01 p.m. UTC | #3
On Thu, 1 Mar 2018 19:56:51 +0800
Haozhong Zhang <haozhong.zhang@intel.com> wrote:

> On 03/01/18 11:42 +0100, Igor Mammedov wrote:
> > On Wed, 28 Feb 2018 12:02:58 +0800
> > Haozhong Zhang <haozhong.zhang@intel.com> wrote:
> >   
> > > ACPI 6.2A Table 5-129 "SPA Range Structure" requires that the proximity
> > > domain of an NVDIMM SPA range match the corresponding entry in the
> > > SRAT table.
> > > 
> > > The address ranges of vNVDIMM in QEMU are allocated from the
> > > hot-pluggable address space, which is entirely covered by one SRAT
> > > memory affinity structure. However, users can set the vNVDIMM
> > > proximity domain in the NFIT SPA range structure via the 'node'
> > > property of '-device nvdimm' to a value different from the one in the
> > > above SRAT memory affinity structure.
> > > 
> > > To resolve this proximity domain mismatch, this patch builds
> > > one SRAT memory affinity structure for each static-plugged DIMM device,  
> > s/static-plugged/present at boot/
> > since after hotplug and a following reset the SRAT will be recreated
> > and will include hotplugged DIMMs as well.
> 
> Ah yes, I'll fix the message in the next version.
> 
> >   
> > > including both PC-DIMM and NVDIMM, with the proximity domain specified
> > > in '-device pc-dimm' or '-device nvdimm'.
> > > 
> > > The remaining hot-pluggable address space is covered by one or multiple
> > > SRAT memory affinity structures with the proximity domain of the last
> > > node as before.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > ---
> > >  hw/i386/acpi-build.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++----
> > >  hw/mem/pc-dimm.c         |  8 ++++++++
> > >  include/hw/mem/pc-dimm.h | 10 ++++++++++
> > >  3 files changed, 64 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > > index deb440f286..a88de06d8f 100644
> > > --- a/hw/i386/acpi-build.c
> > > +++ b/hw/i386/acpi-build.c
> > > @@ -2323,6 +2323,49 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
> > >  #define HOLE_640K_START  (640 * 1024)
> > >  #define HOLE_640K_END   (1024 * 1024)
> > >  
> > > +static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
> > > +                                           uint64_t len, int default_node)
> > > +{
> > > +    GSList *dimms = pc_dimm_get_device_list();
> > > +    GSList *ent = dimms;
> > > +    PCDIMMDevice *dev;
> > > +    Object *obj;
> > > +    uint64_t end = base + len, addr, size;
> > > +    int node;
> > > +    AcpiSratMemoryAffinity *numamem;
> > > +
> > > +    while (base < end) {  
> > It's just a matter of taste, but wouldn't a 'for' loop be better here?
> > One can see the start, end, and next step from the beginning.
> 
> will switch to a for loop
> 
> >   
> > > +        numamem = acpi_data_push(table_data, sizeof *numamem);
> > > +
> > > +        if (!ent) {
> > > +            build_srat_memory(numamem, base, end - base, default_node,
> > > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > > +            break;
> > > +        }
> > > +
> > > +        dev = PC_DIMM(ent->data);
> > > +        obj = OBJECT(dev);
> > > +        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, NULL);
> > > +        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
> > > +        node = object_property_get_uint(obj, PC_DIMM_NODE_PROP, NULL);
> > > +
> > > +        if (base < addr) {
> > > +            build_srat_memory(numamem, base, addr - base, default_node,
> > > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > > +            numamem = acpi_data_push(table_data, sizeof *numamem);
> > > +        }
> > > +        build_srat_memory(numamem, addr, size, node,
> > > +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED |  
> > Is NVDIMM hotplug supported in QEMU?
> > If not, we might need to make MEM_AFFINITY_HOTPLUGGABLE conditional too.
> 
> Yes, it's supported.
> 
> >   
> > > +                          (object_dynamic_cast(obj, TYPE_NVDIMM) ?
> > > +                           MEM_AFFINITY_NON_VOLATILE : 0));  
> > it might be cleaner without the inline flags duplication:
> > 
> >   flags = MEM_AFFINITY_ENABLED;
> >   ...
> >   if (!ent) {
> >       flags |= MEM_AFFINITY_HOTPLUGGABLE;
> >   }
> >   ...
> >   if (PCDIMMDeviceInfo::hotpluggable) { // see ***
> >       flags |= MEM_AFFINITY_HOTPLUGGABLE;
> >   }
> >   ...
> >   if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
> >       flags |= MEM_AFFINITY_NON_VOLATILE;
> >   }
> 
> I'm fine with such changes, except ***
> 
> [..]
> > > diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
> > > index 6e74b61cb6..9fd901e87a 100644
> > > --- a/hw/mem/pc-dimm.c
> > > +++ b/hw/mem/pc-dimm.c
> > > @@ -276,6 +276,14 @@ static int pc_dimm_built_list(Object *obj, void *opaque)
> > >      return 0;
> > >  }
> > >  
> > > +GSList *pc_dimm_get_device_list(void)
> > > +{
> > > +    GSList *list = NULL;
> > > +
> > > +    object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list);
> > > +    return list;
> > > +}  
> > (***)
> > see http://lists.gnu.org/archive/html/qemu-ppc/2018-02/msg00271.html
> > You could do that in a separate patch, so that it won't matter
> > whose patch gets merged first and it won't affect the rest of the patches.
> 
> Sure, I can separate this part, but I would still like to use a list
> of PCDIMMDevice rather than a list of MemoryDeviceInfo. The latter
> would need to be extended to include NVDIMM information (e.g., adding
> an NVDIMMDeviceInfo to the union).
You don't have to add NVDIMMDeviceInfo until there is a need to
expose NVDIMM-specific information.

The qmp_pc_dimm_device_list() API is sufficient in this case
(modulo the missing sorting).

A suggestion has been made to keep the number of public APIs that
do almost the same thing to a minimum.

> Haozhong
> 
> > 
> >   
> > >  uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
> > >                                 uint64_t address_space_size,
> > >                                 uint64_t *hint, uint64_t align, uint64_t size,
> > > diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
> > > index d83b957829..4cf5cc49e9 100644
> > > --- a/include/hw/mem/pc-dimm.h
> > > +++ b/include/hw/mem/pc-dimm.h
> > > @@ -100,4 +100,14 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
> > >                           MemoryRegion *mr, uint64_t align, Error **errp);
> > >  void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
> > >                             MemoryRegion *mr);
> > > +
> > > +/*
> > > + * Return a list of DeviceState of pc-dimm and nvdimm devices.
> > > + * The list is sorted in ascending order of the devices' base
> > > + * addresses.
> > > + *
> > > + * Note: callers are responsible for freeing the list.
> > > + */
> > > +GSList *pc_dimm_get_device_list(void);
> > > +
> > >  #endif  
> >
Haozhong Zhang March 1, 2018, 1:12 p.m. UTC | #4
On 03/01/18 14:01 +0100, Igor Mammedov wrote:
> On Thu, 1 Mar 2018 19:56:51 +0800
> Haozhong Zhang <haozhong.zhang@intel.com> wrote:
> 
> > On 03/01/18 11:42 +0100, Igor Mammedov wrote:
> > > On Wed, 28 Feb 2018 12:02:58 +0800
> > > Haozhong Zhang <haozhong.zhang@intel.com> wrote:
> > >   
> > > > ACPI 6.2A Table 5-129 "SPA Range Structure" requires that the proximity
> > > > domain of an NVDIMM SPA range match the corresponding entry in the
> > > > SRAT table.
> > > > 
> > > > The address ranges of vNVDIMM in QEMU are allocated from the
> > > > hot-pluggable address space, which is entirely covered by one SRAT
> > > > memory affinity structure. However, users can set the vNVDIMM
> > > > proximity domain in the NFIT SPA range structure via the 'node'
> > > > property of '-device nvdimm' to a value different from the one in the
> > > > above SRAT memory affinity structure.
> > > > 
> > > > To resolve this proximity domain mismatch, this patch builds
> > > > one SRAT memory affinity structure for each static-plugged DIMM device,  
> > > s/static-plugged/present at boot/
> > > since after hotplug and a following reset the SRAT will be recreated
> > > and will include hotplugged DIMMs as well.
> > 
> > Ah yes, I'll fix the message in the next version.
> > 
> > >   
> > > > including both PC-DIMM and NVDIMM, with the proximity domain specified
> > > > in '-device pc-dimm' or '-device nvdimm'.
> > > > 
> > > > The remaining hot-pluggable address space is covered by one or multiple
> > > > SRAT memory affinity structures with the proximity domain of the last
> > > > node as before.
> > > > 
> > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > > ---
> > > >  hw/i386/acpi-build.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++----
> > > >  hw/mem/pc-dimm.c         |  8 ++++++++
> > > >  include/hw/mem/pc-dimm.h | 10 ++++++++++
> > > >  3 files changed, 64 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > > > index deb440f286..a88de06d8f 100644
> > > > --- a/hw/i386/acpi-build.c
> > > > +++ b/hw/i386/acpi-build.c
> > > > @@ -2323,6 +2323,49 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
> > > >  #define HOLE_640K_START  (640 * 1024)
> > > >  #define HOLE_640K_END   (1024 * 1024)
> > > >  
> > > > +static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
> > > > +                                           uint64_t len, int default_node)
> > > > +{
> > > > +    GSList *dimms = pc_dimm_get_device_list();
> > > > +    GSList *ent = dimms;
> > > > +    PCDIMMDevice *dev;
> > > > +    Object *obj;
> > > > +    uint64_t end = base + len, addr, size;
> > > > +    int node;
> > > > +    AcpiSratMemoryAffinity *numamem;
> > > > +
> > > > +    while (base < end) {  
> > > It's just a matter of taste, but wouldn't a 'for' loop be better here?
> > > One can see the start, end, and next step from the beginning.
> > 
> > will switch to a for loop
> > 
> > >   
> > > > +        numamem = acpi_data_push(table_data, sizeof *numamem);
> > > > +
> > > > +        if (!ent) {
> > > > +            build_srat_memory(numamem, base, end - base, default_node,
> > > > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        dev = PC_DIMM(ent->data);
> > > > +        obj = OBJECT(dev);
> > > > +        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, NULL);
> > > > +        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
> > > > +        node = object_property_get_uint(obj, PC_DIMM_NODE_PROP, NULL);
> > > > +
> > > > +        if (base < addr) {
> > > > +            build_srat_memory(numamem, base, addr - base, default_node,
> > > > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > > > +            numamem = acpi_data_push(table_data, sizeof *numamem);
> > > > +        }
> > > > +        build_srat_memory(numamem, addr, size, node,
> > > > +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED |  
> > > Is NVDIMM hotplug supported in QEMU?
> > > If not, we might need to make MEM_AFFINITY_HOTPLUGGABLE conditional too.
> > 
> > Yes, it's supported.
> > 
> > >   
> > > > +                          (object_dynamic_cast(obj, TYPE_NVDIMM) ?
> > > > +                           MEM_AFFINITY_NON_VOLATILE : 0));  
> > > it might be cleaner without the inline flags duplication:
> > > 
> > >   flags = MEM_AFFINITY_ENABLED;
> > >   ...
> > >   if (!ent) {
> > >       flags |= MEM_AFFINITY_HOTPLUGGABLE;
> > >   }
> > >   ...
> > >   if (PCDIMMDeviceInfo::hotpluggable) { // see ***
> > >       flags |= MEM_AFFINITY_HOTPLUGGABLE;
> > >   }
> > >   ...
> > >   if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
> > >       flags |= MEM_AFFINITY_NON_VOLATILE;
> > >   }
> > 
> > I'm fine with such changes, except ***
> > 
> > [..]
> > > > diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
> > > > index 6e74b61cb6..9fd901e87a 100644
> > > > --- a/hw/mem/pc-dimm.c
> > > > +++ b/hw/mem/pc-dimm.c
> > > > @@ -276,6 +276,14 @@ static int pc_dimm_built_list(Object *obj, void *opaque)
> > > >      return 0;
> > > >  }
> > > >  
> > > > +GSList *pc_dimm_get_device_list(void)
> > > > +{
> > > > +    GSList *list = NULL;
> > > > +
> > > > +    object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list);
> > > > +    return list;
> > > > +}  
> > > (***)
> > > see http://lists.gnu.org/archive/html/qemu-ppc/2018-02/msg00271.html
> > > You could do that in a separate patch, so that it won't matter
> > > whose patch gets merged first and it won't affect the rest of the patches.
> > 
> > Sure, I can separate this part, but I would still like to use a list
> > of PCDIMMDevice rather than a list of MemoryDeviceInfo. The latter
> > would need to be extended to include NVDIMM information (e.g., adding
> > an NVDIMMDeviceInfo to the union).
> You don't have to add NVDIMMDeviceInfo until there is a need to
> expose NVDIMM-specific information.

Well, I need to know whether a memory device is an NVDIMM in order to
decide whether the non-volatile flag is needed in the SRAT.

> 
> The qmp_pc_dimm_device_list() API is sufficient in this case
> (modulo the missing sorting).

Sorting is not a big issue and can easily be added by using
pc_dimm_built_list in qmp_pc_dimm_device_list().

Haozhong

> 
> A suggestion has been made to keep the number of public APIs that
> do almost the same thing to a minimum.
Igor Mammedov March 1, 2018, 4:06 p.m. UTC | #5
On Thu, 1 Mar 2018 21:12:37 +0800
Haozhong Zhang <haozhong.zhang@intel.com> wrote:

> On 03/01/18 14:01 +0100, Igor Mammedov wrote:
> > On Thu, 1 Mar 2018 19:56:51 +0800
> > Haozhong Zhang <haozhong.zhang@intel.com> wrote:
> >   
> > > On 03/01/18 11:42 +0100, Igor Mammedov wrote:  
> > > > On Wed, 28 Feb 2018 12:02:58 +0800
> > > > Haozhong Zhang <haozhong.zhang@intel.com> wrote:
> > > >     
> > > > > ACPI 6.2A Table 5-129 "SPA Range Structure" requires that the proximity
> > > > > domain of an NVDIMM SPA range match the corresponding entry in the
> > > > > SRAT table.
> > > > > 
> > > > > The address ranges of vNVDIMM in QEMU are allocated from the
> > > > > hot-pluggable address space, which is entirely covered by one SRAT
> > > > > memory affinity structure. However, users can set the vNVDIMM
> > > > > proximity domain in the NFIT SPA range structure via the 'node'
> > > > > property of '-device nvdimm' to a value different from the one in the
> > > > > above SRAT memory affinity structure.
> > > > > 
> > > > > To resolve this proximity domain mismatch, this patch builds
> > > > > one SRAT memory affinity structure for each static-plugged DIMM device,    
> > > > s/static-plugged/present at boot/
> > > > since after hotplug and a following reset the SRAT will be recreated
> > > > and will include hotplugged DIMMs as well.
> > > 
> > > Ah yes, I'll fix the message in the next version.
> > >   
> > > >     
> > > > > including both PC-DIMM and NVDIMM, with the proximity domain specified
> > > > > in '-device pc-dimm' or '-device nvdimm'.
> > > > > 
> > > > > The remaining hot-pluggable address space is covered by one or multiple
> > > > > SRAT memory affinity structures with the proximity domain of the last
> > > > > node as before.
> > > > > 
> > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > > > ---
> > > > >  hw/i386/acpi-build.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++----
> > > > >  hw/mem/pc-dimm.c         |  8 ++++++++
> > > > >  include/hw/mem/pc-dimm.h | 10 ++++++++++
> > > > >  3 files changed, 64 insertions(+), 4 deletions(-)
> > > > > 
> > > > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > > > > index deb440f286..a88de06d8f 100644
> > > > > --- a/hw/i386/acpi-build.c
> > > > > +++ b/hw/i386/acpi-build.c
> > > > > @@ -2323,6 +2323,49 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
> > > > >  #define HOLE_640K_START  (640 * 1024)
> > > > >  #define HOLE_640K_END   (1024 * 1024)
> > > > >  
> > > > > +static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
> > > > > +                                           uint64_t len, int default_node)
> > > > > +{
> > > > > +    GSList *dimms = pc_dimm_get_device_list();
> > > > > +    GSList *ent = dimms;
> > > > > +    PCDIMMDevice *dev;
> > > > > +    Object *obj;
> > > > > +    uint64_t end = base + len, addr, size;
> > > > > +    int node;
> > > > > +    AcpiSratMemoryAffinity *numamem;
> > > > > +
> > > > > +    while (base < end) {    
> > > > It's just a matter of taste, but wouldn't a 'for' loop be better here?
> > > > One can see the start, end, and next step from the beginning.
> > > 
> > > will switch to a for loop
> > >   
> > > >     
> > > > > +        numamem = acpi_data_push(table_data, sizeof *numamem);
> > > > > +
> > > > > +        if (!ent) {
> > > > > +            build_srat_memory(numamem, base, end - base, default_node,
> > > > > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        dev = PC_DIMM(ent->data);
> > > > > +        obj = OBJECT(dev);
> > > > > +        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, NULL);
> > > > > +        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
> > > > > +        node = object_property_get_uint(obj, PC_DIMM_NODE_PROP, NULL);
> > > > > +
> > > > > +        if (base < addr) {
> > > > > +            build_srat_memory(numamem, base, addr - base, default_node,
> > > > > +                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> > > > > +            numamem = acpi_data_push(table_data, sizeof *numamem);
> > > > > +        }
> > > > > +        build_srat_memory(numamem, addr, size, node,
> > > > > +                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED |    
> > > > Is NVDIMM hotplug supported in QEMU?
> > > > If not, we might need to make MEM_AFFINITY_HOTPLUGGABLE conditional too.
> > > 
> > > Yes, it's supported.
> > >   
> > > >     
> > > > > +                          (object_dynamic_cast(obj, TYPE_NVDIMM) ?
> > > > > +                           MEM_AFFINITY_NON_VOLATILE : 0));    
> > > > it might be cleaner without the inline flags duplication:
> > > > 
> > > >   flags = MEM_AFFINITY_ENABLED;
> > > >   ...
> > > >   if (!ent) {
> > > >       flags |= MEM_AFFINITY_HOTPLUGGABLE;
> > > >   }
> > > >   ...
> > > >   if (PCDIMMDeviceInfo::hotpluggable) { // see ***
> > > >       flags |= MEM_AFFINITY_HOTPLUGGABLE;
> > > >   }
> > > >   ...
> > > >   if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
> > > >       flags |= MEM_AFFINITY_NON_VOLATILE;
> > > >   }
> > > 
> > > I'm fine with such changes, except ***
> > > 
> > > [..]  
> > > > > diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
> > > > > index 6e74b61cb6..9fd901e87a 100644
> > > > > --- a/hw/mem/pc-dimm.c
> > > > > +++ b/hw/mem/pc-dimm.c
> > > > > @@ -276,6 +276,14 @@ static int pc_dimm_built_list(Object *obj, void *opaque)
> > > > >      return 0;
> > > > >  }
> > > > >  
> > > > > +GSList *pc_dimm_get_device_list(void)
> > > > > +{
> > > > > +    GSList *list = NULL;
> > > > > +
> > > > > +    object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list);
> > > > > +    return list;
> > > > > +}    
> > > > (***)
> > > > see http://lists.gnu.org/archive/html/qemu-ppc/2018-02/msg00271.html
> > > > You could do that in a separate patch, so that it won't matter
> > > > whose patch gets merged first and it won't affect the rest of the patches.
> > > 
> > > Sure, I can separate this part, but I would still like to use a list
> > > of PCDIMMDevice rather than a list of MemoryDeviceInfo. The latter
> > > would need to be extended to include NVDIMM information (e.g., adding
> > > an NVDIMMDeviceInfo to the union).
> > You don't have to add NVDIMMDeviceInfo until there is a need to
> > expose NVDIMM-specific information.
> 
> Well, I need to know whether a memory device is an NVDIMM in order to
> decide whether the non-volatile flag is needed in the SRAT.
Maybe we should add NVDIMMDeviceInfo after all; an extra benefit
would be that HMP 'info memory-devices' and QMP query-memory-devices
would show the correct device type.

We can do something like this for starters

+##
+# @NVDIMMDeviceInfo:
+#
+# 'nvdimm' device state information
+#
+# Since: 2.12
+##
+{ 'struct': 'NVDIMMDeviceInfo',
+   'base': 'PCDIMMDeviceInfo',
+   'data': {}
+}

and later extend the 'data' section with nvdimm-specific data when necessary.
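
A sketch of how the SRAT builder could then key off the union, assuming
qmp_pc_dimm_device_list() gets reworked to return a sorted
MemoryDeviceInfoList and that the new branch generates a
MEMORY_DEVICE_INFO_KIND_NVDIMM kind (both are assumptions following
QAPI codegen conventions, not existing code):

  MemoryDeviceInfoList *info_list = qmp_pc_dimm_device_list();
  MemoryDeviceInfoList *info;

  for (info = info_list; info; info = info->next) {
      MemoryDeviceInfo *mi = info->value;
      MemoryAffinityFlags flags = MEM_AFFINITY_ENABLED | MEM_AFFINITY_HOTPLUGGABLE;
      uint64_t addr, size;
      int node;

      if (mi->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
          NVDIMMDeviceInfo *ndi = mi->u.nvdimm.data;  /* assumed branch name */
          flags |= MEM_AFFINITY_NON_VOLATILE;
          addr = ndi->addr; size = ndi->size; node = ndi->node;
      } else {
          PCDIMMDeviceInfo *di = mi->u.dimm.data;
          addr = di->addr; size = di->size; node = di->node;
      }
      numamem = acpi_data_push(table_data, sizeof *numamem);
      build_srat_memory(numamem, addr, size, node, flags);
  }
  qapi_free_MemoryDeviceInfoList(info_list);

with table_data and numamem as in build_srat_hotpluggable_memory() above.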
 
> > The qmp_pc_dimm_device_list() API is sufficient in this case
> > (modulo the missing sorting).
> 
> Sorting is not a big issue and can easily be added by using
> pc_dimm_built_list in qmp_pc_dimm_device_list().
> 
> Haozhong
> 
> > 
> > A suggestion has been made to keep the number of public APIs that
> > do almost the same thing to a minimum.
>

Patch

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index deb440f286..a88de06d8f 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2323,6 +2323,49 @@  build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
 #define HOLE_640K_START  (640 * 1024)
 #define HOLE_640K_END   (1024 * 1024)
 
+static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
+                                           uint64_t len, int default_node)
+{
+    GSList *dimms = pc_dimm_get_device_list();
+    GSList *ent = dimms;
+    PCDIMMDevice *dev;
+    Object *obj;
+    uint64_t end = base + len, addr, size;
+    int node;
+    AcpiSratMemoryAffinity *numamem;
+
+    while (base < end) {
+        numamem = acpi_data_push(table_data, sizeof *numamem);
+
+        if (!ent) {
+            build_srat_memory(numamem, base, end - base, default_node,
+                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
+            break;
+        }
+
+        dev = PC_DIMM(ent->data);
+        obj = OBJECT(dev);
+        addr = object_property_get_uint(obj, PC_DIMM_ADDR_PROP, NULL);
+        size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
+        node = object_property_get_uint(obj, PC_DIMM_NODE_PROP, NULL);
+
+        if (base < addr) {
+            build_srat_memory(numamem, base, addr - base, default_node,
+                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
+            numamem = acpi_data_push(table_data, sizeof *numamem);
+        }
+        build_srat_memory(numamem, addr, size, node,
+                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED |
+                          (object_dynamic_cast(obj, TYPE_NVDIMM) ?
+                           MEM_AFFINITY_NON_VOLATILE : 0));
+
+        base = addr + size;
+        ent = g_slist_next(ent);
+    }
+
+    g_slist_free(dimms);
+}
+
 static void
 build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 {
@@ -2434,10 +2477,9 @@  build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
      * providing _PXM method if necessary.
      */
     if (hotplugabble_address_space_size) {
-        numamem = acpi_data_push(table_data, sizeof *numamem);
-        build_srat_memory(numamem, pcms->hotplug_memory.base,
-                          hotplugabble_address_space_size, pcms->numa_nodes - 1,
-                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
+        build_srat_hotpluggable_memory(table_data, pcms->hotplug_memory.base,
+                                       hotplugabble_address_space_size,
+                                       pcms->numa_nodes - 1);
     }
 
     build_header(linker, table_data,
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 6e74b61cb6..9fd901e87a 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -276,6 +276,14 @@  static int pc_dimm_built_list(Object *obj, void *opaque)
     return 0;
 }
 
+GSList *pc_dimm_get_device_list(void)
+{
+    GSList *list = NULL;
+
+    object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list);
+    return list;
+}
+
 uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
                                uint64_t address_space_size,
                                uint64_t *hint, uint64_t align, uint64_t size,
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index d83b957829..4cf5cc49e9 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -100,4 +100,14 @@  void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
                          MemoryRegion *mr, uint64_t align, Error **errp);
 void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
                            MemoryRegion *mr);
+
+/*
+ * Return a list of DeviceState of pc-dimm and nvdimm devices.
+ * The list is sorted in ascending order of the devices' base
+ * addresses.
+ *
+ * Note: callers are responsible for freeing the list.
+ */
+GSList *pc_dimm_get_device_list(void);
+
 #endif