
[RFC,2/2] numa: Add node_id data in query-hotpluggable-cpus

Message ID 417b83d0e074e2004aa35bef337d32ac2c89f559.1467904342.git.pkrempa@redhat.com (mailing list archive)
State New, archived

Commit Message

Peter Krempa July 7, 2016, 3:17 p.m. UTC
Add a helper that looks up the NUMA node for a given CPU and use it to
fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---
 hw/i386/pc.c          |  7 +++++++
 hw/ppc/spapr.c        |  8 ++++++--
 include/sysemu/numa.h |  1 +
 numa.c                | 13 +++++++++++++
 4 files changed, 27 insertions(+), 2 deletions(-)

Comments

Andrew Jones July 7, 2016, 4:10 p.m. UTC | #1
On Thu, Jul 07, 2016 at 05:17:14PM +0200, Peter Krempa wrote:
> Add a helper that looks up the NUMA node for a given CPU and use it to
> fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.
> 
> Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> ---
>  hw/i386/pc.c          |  7 +++++++
>  hw/ppc/spapr.c        |  8 ++++++--
>  include/sysemu/numa.h |  1 +
>  numa.c                | 13 +++++++++++++
>  4 files changed, 27 insertions(+), 2 deletions(-)

The helper function should be a separate patch, and Igor beat you
to it, see

https://www.mail-archive.com/qemu-devel@nongnu.org/msg383461.html

drew

> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 4ba02c4..a0b9507 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -2115,6 +2115,7 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
>      HotpluggableCPUList *head = NULL;
>      PCMachineState *pcms = PC_MACHINE(machine);
>      const char *cpu_type;
> +    int node_id;
> 
>      cpu = pcms->possible_cpus->cpus[0].cpu;
>      assert(cpu); /* BSP is always present */
> @@ -2138,6 +2139,12 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
>          cpu_props->core_id = topo.core_id;
>          cpu_props->has_thread_id = true;
>          cpu_props->thread_id = topo.smt_id;
> +
> +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> +            cpu_props->has_node_id = true;
> +            cpu_props->node_id = node_id;
> +        }
> +
>          cpu_item->props = cpu_props;
> 
>          cpu = pcms->possible_cpus->cpus[i].cpu;
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d1f5195..06ba7fc 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2370,6 +2370,7 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
>      sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>      int spapr_max_cores = max_cpus / smp_threads;
>      int smt = kvmppc_smt_threads();
> +    int node_id;
> 
>      for (i = 0; i < spapr_max_cores; i++) {
>          HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1);
> @@ -2381,8 +2382,11 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
>          cpu_item->vcpu_id = i;
>          cpu_props->has_core_id = true;
>          cpu_props->core_id = i * smt;
> -        /* TODO: add 'has_node/node' here to describe
> -           to which node core belongs */
> +
> +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> +            cpu_props->has_node_id = true;
> +            cpu_props->node_id = node_id;
> +        }
> 
>          cpu_item->props = cpu_props;
>          if (spapr->cores[i]) {
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index bb184c9..04d7097 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -31,5 +31,6 @@ extern QemuOptsList qemu_numa_opts;
>  void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
>  void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
>  uint32_t numa_get_node(ram_addr_t addr, Error **errp);
> +int numa_node_get_by_cpu_index(int cpu_index);
> 
>  #endif
> diff --git a/numa.c b/numa.c
> index cbae430..365738a 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -506,6 +506,19 @@ void query_numa_node_mem(uint64_t node_mem[])
>      }
>  }
> 
> +int numa_node_get_by_cpu_index(int cpu_index)
> +{
> +    int i;
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (test_bit(cpu_index, numa_info[i].node_cpu)) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
>  static int query_memdev(Object *obj, void *opaque)
>  {
>      MemdevList **list = opaque;
> -- 
> 2.9.0
> 
>
David Gibson July 8, 2016, 2:23 a.m. UTC | #2
On Thu,  7 Jul 2016 17:17:14 +0200
Peter Krempa <pkrempa@redhat.com> wrote:

> Add a helper that looks up the NUMA node for a given CPU and use it to
> fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.


IIUC how the query thing works this means that the node id issued by
query-hotpluggable-cpus will be echoed back to device add by libvirt.
I'm not sure we actually process that information in the core at
present, so I don't know that that's right.

We need to be clear on which direction information is flowing here.
Does query-hotpluggable-cpus *define* the NUMA node allocation which is
then passed to the core device which implements it.  Or is the NUMA
allocation defined elsewhere, and query-hotpluggable-cpus just reports
it.

> Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> ---
>  hw/i386/pc.c          |  7 +++++++
>  hw/ppc/spapr.c        |  8 ++++++--
>  include/sysemu/numa.h |  1 +
>  numa.c                | 13 +++++++++++++
>  4 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 4ba02c4..a0b9507 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -2115,6 +2115,7 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
>      HotpluggableCPUList *head = NULL;
>      PCMachineState *pcms = PC_MACHINE(machine);
>      const char *cpu_type;
> +    int node_id;
> 
>      cpu = pcms->possible_cpus->cpus[0].cpu;
>      assert(cpu); /* BSP is always present */
> @@ -2138,6 +2139,12 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
>          cpu_props->core_id = topo.core_id;
>          cpu_props->has_thread_id = true;
>          cpu_props->thread_id = topo.smt_id;
> +
> +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> +            cpu_props->has_node_id = true;
> +            cpu_props->node_id = node_id;
> +        }
> +
>          cpu_item->props = cpu_props;
> 
>          cpu = pcms->possible_cpus->cpus[i].cpu;
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d1f5195..06ba7fc 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2370,6 +2370,7 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
>      sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>      int spapr_max_cores = max_cpus / smp_threads;
>      int smt = kvmppc_smt_threads();
> +    int node_id;
> 
>      for (i = 0; i < spapr_max_cores; i++) {
>          HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1);
> @@ -2381,8 +2382,11 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
>          cpu_item->vcpu_id = i;
>          cpu_props->has_core_id = true;
>          cpu_props->core_id = i * smt;
> -        /* TODO: add 'has_node/node' here to describe
> -           to which node core belongs */
> +
> +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {

As with the previous patch this is incorrect, because
numa_node_get_by_cpu_index() is working from a vcpu (i.e. thread)
index, but you're passing a core index.
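
A minimal sketch of one way to address this (illustrative only, not part of
the posted patch): look up the node via the cpu_index of the core's first
thread, assuming spapr's convention that core i starts at
cpu_index i * smp_threads.

    /* Illustrative sketch: translate the core loop index into the vcpu
     * (thread) index of the core's first thread before querying the NUMA
     * mapping, instead of passing the core index directly. */
    node_id = numa_node_get_by_cpu_index(i * smp_threads);
    if (node_id >= 0) {
        cpu_props->has_node_id = true;
        cpu_props->node_id = node_id;
    }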

> +            cpu_props->has_node_id = true;
> +            cpu_props->node_id = node_id;
> +        }
> 
>          cpu_item->props = cpu_props;
>          if (spapr->cores[i]) {
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index bb184c9..04d7097 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -31,5 +31,6 @@ extern QemuOptsList qemu_numa_opts;
>  void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
>  void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
>  uint32_t numa_get_node(ram_addr_t addr, Error **errp);
> +int numa_node_get_by_cpu_index(int cpu_index);
> 
>  #endif
> diff --git a/numa.c b/numa.c
> index cbae430..365738a 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -506,6 +506,19 @@ void query_numa_node_mem(uint64_t node_mem[])
>      }
>  }
> 
> +int numa_node_get_by_cpu_index(int cpu_index)
> +{
> +    int i;
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (test_bit(cpu_index, numa_info[i].node_cpu)) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
>  static int query_memdev(Object *obj, void *opaque)
>  {
>      MemdevList **list = opaque;
> -- 
> 2.9.0
>
Peter Krempa July 8, 2016, 7:46 a.m. UTC | #3
On Fri, Jul 08, 2016 at 12:23:08 +1000, David Gibson wrote:
> On Thu,  7 Jul 2016 17:17:14 +0200
> Peter Krempa <pkrempa@redhat.com> wrote:
> 
> > Add a helper that looks up the NUMA node for a given CPU and use it to
> > fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.
> 
> 
> IIUC how the query thing works this means that the node id issued by
> query-hotpluggable-cpus will be echoed back to device add by libvirt.

It will be echoed back, but the problem is that it's configured
separately ...

> I'm not sure we actually process that information in the core at
> present, so I don't know that that's right.
> 
> We need to be clear on which direction information is flowing here.
> Does query-hotpluggable-cpus *define* the NUMA node allocation which is
> then passed to the core device which implements it.  Or is the NUMA
> allocation defined elsewhere, and query-hotpluggable-cpus just reports
> it.

Currently (in the pre-hotplug era) the NUMA topology is defined by
specifying CPU numbers (see previous patch) on the commandline:

-numa node,nodeid=1,cpus=1-5,cpus=8,cpus=11...

This is then reported to the guest.

For a machine started with:

-smp 5,maxcpus=8,sockets=2,cores=2,threads=2
-numa node,nodeid=0,cpus=0,cpus=2,cpus=4,cpus=6,mem=500
-numa node,nodeid=1,cpus=1,cpus=3,cpus=5,cpus=7,mem=500

you get the following topology that is not really possible with
hardware:

# lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                5
On-line CPU(s) list:   0-4
Thread(s) per core:    1
Core(s) per socket:    2
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 6
Model name:            QEMU Virtual CPU version 2.5+
Stepping:              3
CPU MHz:               3504.318
BogoMIPS:              7008.63
Hypervisor vendor:     KVM
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              4096K
NUMA node0 CPU(s):     0,2,4
NUMA node1 CPU(s):     1,3

Note that the count of cpus per numa node does not need to be identical.

Given the above, 'query-hotpluggable-cpus' will need to report the data
that was configured above even if it doesn't make much sense in the real
world.
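
For illustration, with this patch applied the reply for the x86 config above
might look roughly like the following (heavily abbreviated; the "type" string
and the exact spelling of the property keys are assumptions, not taken from
this thread):

  -> { "execute": "query-hotpluggable-cpus" }
  <- { "return": [
         { "type": "qemu64-x86_64-cpu",
           "props": { "socket-id": 0, "core-id": 0, "thread-id": 0,
                      "node-id": 0 } },
         { "type": "qemu64-x86_64-cpu",
           "props": { "socket-id": 0, "core-id": 0, "thread-id": 1,
                      "node-id": 1 } },
         ... remaining entries omitted ...
       ] }

Note how two threads of the same core can end up on different nodes, which
is exactly the "not really possible with hardware" situation shown above.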

I did not try the above on a PPC host and thus I'm not sure whether the
config above is allowed or not.

While for the hotplugged cpus it would be possible to plug in the correct
one according to the queried data, with the current approach it's
impossible to set the initial vcpus differently.

Peter

Note: For libvirt it's a no-go to start a throwaway qemu process just to
query the information and thus it's desired to have a way to configure
all this without the need to query with a specific machine
type/topology.
Igor Mammedov July 8, 2016, 11:54 a.m. UTC | #4
On Thu,  7 Jul 2016 17:17:14 +0200
Peter Krempa <pkrempa@redhat.com> wrote:

> Add a helper that looks up the NUMA node for a given CPU and use it to
> fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.
> 
> Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> ---
>  hw/i386/pc.c          |  7 +++++++
>  hw/ppc/spapr.c        |  8 ++++++--
>  include/sysemu/numa.h |  1 +
>  numa.c                | 13 +++++++++++++
>  4 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 4ba02c4..a0b9507 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -2115,6 +2115,7 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
>      HotpluggableCPUList *head = NULL;
>      PCMachineState *pcms = PC_MACHINE(machine);
>      const char *cpu_type;
> +    int node_id;
> 
>      cpu = pcms->possible_cpus->cpus[0].cpu;
>      assert(cpu); /* BSP is always present */
> @@ -2138,6 +2139,12 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
>          cpu_props->core_id = topo.core_id;
>          cpu_props->has_thread_id = true;
>          cpu_props->thread_id = topo.smt_id;
> +
> +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> +            cpu_props->has_node_id = true;
> +            cpu_props->node_id = node_id;
> +        }
I've not included node_id for a reason,
 "-numa cpus=1,2,3..." looks to me hopelessly broken now but
I've not come up with an idea how to redo it in a nice and clean way yet.

An alternative could be a CLI-less numa configuration, where QEMU is started
without "-numa cpus" but with "-S"; then mgmt could call
query_hotpluggable_cpus() to get the possible CPUs and then
map them to numa nodes with a new QMP command using the attributes
it got from query_hotpluggable_cpus().

it's along the lines of: start QEMU with -smp 1,maxcpus=X and then add the
remaining CPUs with device_add after getting properties from
query_hotpluggable_cpus().

then at machine_done time we can adjust DT/ACPI data to reflect
configured mapping.
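
A rough sketch of how that flow could look on the wire (purely hypothetical;
"set-numa-cpu" is a made-up command name used here only for illustration, no
such QMP command exists at this point, and the property names are assumed):

  # QEMU started with -S, -smp 1,maxcpus=8,... and without "-numa ...,cpus=..."
  -> { "execute": "query-hotpluggable-cpus" }
  <- { "return": [ { "type": "...", "props": { "core-id": 1, ... } }, ... ] }

  # map a possible CPU to a numa node using the properties returned above
  -> { "execute": "set-numa-cpu",
       "arguments": { "node-id": 1, "core-id": 1 } }
  <- { "return": {} }

  # hotplug any remaining CPUs with device_add, then resume the guest
  -> { "execute": "device_add",
       "arguments": { "driver": "...", "id": "cpu1", "core-id": 1 } }
  -> { "execute": "cont" }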

>          cpu_item->props = cpu_props;
> 
>          cpu = pcms->possible_cpus->cpus[i].cpu;
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d1f5195..06ba7fc 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2370,6 +2370,7 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
>      sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>      int spapr_max_cores = max_cpus / smp_threads;
>      int smt = kvmppc_smt_threads();
> +    int node_id;
> 
>      for (i = 0; i < spapr_max_cores; i++) {
>          HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1);
> @@ -2381,8 +2382,11 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
>          cpu_item->vcpu_id = i;
>          cpu_props->has_core_id = true;
>          cpu_props->core_id = i * smt;
> -        /* TODO: add 'has_node/node' here to describe
> -           to which node core belongs */
> +
> +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> +            cpu_props->has_node_id = true;
> +            cpu_props->node_id = node_id;
> +        }
> 
>          cpu_item->props = cpu_props;
>          if (spapr->cores[i]) {
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index bb184c9..04d7097 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -31,5 +31,6 @@ extern QemuOptsList qemu_numa_opts;
>  void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
>  void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
>  uint32_t numa_get_node(ram_addr_t addr, Error **errp);
> +int numa_node_get_by_cpu_index(int cpu_index);
> 
>  #endif
> diff --git a/numa.c b/numa.c
> index cbae430..365738a 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -506,6 +506,19 @@ void query_numa_node_mem(uint64_t node_mem[])
>      }
>  }
> 
> +int numa_node_get_by_cpu_index(int cpu_index)
> +{
> +    int i;
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (test_bit(cpu_index, numa_info[i].node_cpu)) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
>  static int query_memdev(Object *obj, void *opaque)
>  {
>      MemdevList **list = opaque;
Peter Krempa July 8, 2016, 12:04 p.m. UTC | #5
On Fri, Jul 08, 2016 at 13:54:58 +0200, Igor Mammedov wrote:
> On Thu,  7 Jul 2016 17:17:14 +0200
> Peter Krempa <pkrempa@redhat.com> wrote:
> 
> > Add a helper that looks up the NUMA node for a given CPU and use it to
> > fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.
> > 
> > Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> > ---
> >  hw/i386/pc.c          |  7 +++++++
> >  hw/ppc/spapr.c        |  8 ++++++--
> >  include/sysemu/numa.h |  1 +
> >  numa.c                | 13 +++++++++++++
> >  4 files changed, 27 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 4ba02c4..a0b9507 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -2115,6 +2115,7 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
> >      HotpluggableCPUList *head = NULL;
> >      PCMachineState *pcms = PC_MACHINE(machine);
> >      const char *cpu_type;
> > +    int node_id;
> > 
> >      cpu = pcms->possible_cpus->cpus[0].cpu;
> >      assert(cpu); /* BSP is always present */
> > @@ -2138,6 +2139,12 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
> >          cpu_props->core_id = topo.core_id;
> >          cpu_props->has_thread_id = true;
> >          cpu_props->thread_id = topo.smt_id;
> > +
> > +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> > +            cpu_props->has_node_id = true;
> > +            cpu_props->node_id = node_id;
> > +        }
> I've not included node_id for a reason,
>  "-numa cpus=1,2,3..." looks to me hopelessly broken now but
> I've not come up with an idea how to redo it in a nice and clean way yet.
> 
> Alternative could be CLI-less numa configuration, where QEMU is started
> without "-numa cpus" but with "-S" then mgmt could call
> query_hotpluggable_cpus() to get possible CPUs and then
> map them to numa  nodes with a new QMP command using attributes
> it got from query_hotpluggable_cpus().

I think this could work for libvirt. The CPU index we currently expose
in the XML would become just a libvirt internal detail and the new QMP
command would be used to do the setup. Adding some QMP calls during VM
startup is okay.

> it's along the way start QEMU -smp 1,maxcpus=X and then add
> remaining CPUs with device_add after getting properties from
> query_hotpluggable_cpus().

I'm going to use a similar approach even for the hotpluggable cpus so I
can query the data for a new VM. On the other hand I can't make libvirt
use the approach with -smp 1,... all the time since we guarantee that an
XML that worked on an older version will be migratable back to the older
version.

> then at machine_done time we can adjust DT/ACPI data to reflect
> configured mapping.

In such case this series can be dropped since it provides what I need
differently.

Thanks,

Peter
Igor Mammedov July 8, 2016, 12:06 p.m. UTC | #6
On Fri, 8 Jul 2016 09:46:00 +0200
Peter Krempa <pkrempa@redhat.com> wrote:

[...]
> Note: For libvirt it's a no-go to start a throwaway qemu process just to
> query the information and thus it's desired to have a way to configure
> all this without the need to query with a specific machine
> type/topology.
Is it a no-go to start a throwaway qemu at new domain creation time?
I.e. the user creates a new domain with a specific -machine/-smp layout,
libvirt stores the results of query-hotpluggable-cpus and then allows the
user in (e.g.) virt-manager to pick which CPUs are enabled and optionally
to which numa nodes they are mapped.
Igor Mammedov July 8, 2016, 12:10 p.m. UTC | #7
On Fri, 8 Jul 2016 14:04:23 +0200
Peter Krempa <pkrempa@redhat.com> wrote:

> On Fri, Jul 08, 2016 at 13:54:58 +0200, Igor Mammedov wrote:
> > On Thu,  7 Jul 2016 17:17:14 +0200
> > Peter Krempa <pkrempa@redhat.com> wrote:
> >   
> > > Add a helper that looks up the NUMA node for a given CPU and use it to
> > > fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.
> > > 
> > > Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> > > ---
> > >  hw/i386/pc.c          |  7 +++++++
> > >  hw/ppc/spapr.c        |  8 ++++++--
> > >  include/sysemu/numa.h |  1 +
> > >  numa.c                | 13 +++++++++++++
> > >  4 files changed, 27 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > > index 4ba02c4..a0b9507 100644
> > > --- a/hw/i386/pc.c
> > > +++ b/hw/i386/pc.c
> > > @@ -2115,6 +2115,7 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
> > >      HotpluggableCPUList *head = NULL;
> > >      PCMachineState *pcms = PC_MACHINE(machine);
> > >      const char *cpu_type;
> > > +    int node_id;
> > > 
> > >      cpu = pcms->possible_cpus->cpus[0].cpu;
> > >      assert(cpu); /* BSP is always present */
> > > @@ -2138,6 +2139,12 @@ static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
> > >          cpu_props->core_id = topo.core_id;
> > >          cpu_props->has_thread_id = true;
> > >          cpu_props->thread_id = topo.smt_id;
> > > +
> > > +        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
> > > +            cpu_props->has_node_id = true;
> > > +            cpu_props->node_id = node_id;
> > > +        }  
> > I've not included node_id for a reason,
> >  "-numa cpus=1,2,3..." looks to me hopelessly broken now but
> I've not come up with an idea how to redo it in a nice and clean way yet.
> > 
> > Alternative could be CLI-less numa configuration, where QEMU is started
> > without "-numa cpus" but with "-S" then mgmt could call
> > query_hotpluggable_cpus() to get possible CPUs and then
> > map them to numa  nodes with a new QMP command using attributes
> > it got from query_hotpluggable_cpus().  
> 
> I think this could work for libvirt. The CPU index we currently expose
> in the XML would become just a libvirt internal detail and the new QMP
> command would be used to do the setup. Adding some QMP calls during VM
> startup is okay.
> 
> > it's along the way start QEMU -smp 1,maxcpus=X and then add
> > remaining CPUs with device_add after getting properties from
> > query_hotpluggable_cpus().  
> 
> I'm going to use a similar approach even for the hotpluggable cpus so I
> can query the data for a new VM. On the other hand I can't make libvirt
> use the approach with -smp 1,... all the time since we guarantee that an
> XML that worked on an older version will be migratable back to the older
> version.
Could you explain a little bit more about the issue?

> 
> > then at machine_done time we can adjust DT/ACPI data to reflect
> > configured mapping.  
> 
> In such case this series can be dropped since it provides what I need
> differently.
> 
> Thanks,
> 
> Peter
>
Peter Krempa July 8, 2016, 12:26 p.m. UTC | #8
On Fri, Jul 08, 2016 at 14:06:31 +0200, Igor Mammedov wrote:
> On Fri, 8 Jul 2016 09:46:00 +0200
> Peter Krempa <pkrempa@redhat.com> wrote:
> 
> [...]
> > Note: For libvirt it's a no-go to start a throwaway qemu process just to
> > query the information and thus it's desired to have a way to configure
> > all this without the need to query with a specific machine
> > type/topology.
> Is it a no-go to start a throwaway qemu at new domain creation time?

Yes. The policy is that once we start the VM, the qemu process that we
create is the one that runs the VM later. We can tweak stuff using
QMP and/or kill it if the configuration requested by the user can't be
achieved, but starting a different qemu process would not be accepted
upstream.

Capability querying is done prior to that with a qemu process with -M
none and the results are cached for the given combination of qemu binary
and libvirtd binary (even across restarts of the libvirtd binary).

> I.e. the user creates a new domain with a specific -machine/-smp layout,
> libvirt stores the results of query-hotpluggable-cpus and then allows the
> user in (e.g.) virt-manager to pick which CPUs are enabled and optionally
> to which numa nodes they are mapped.

We can update certain stuff that was not explicitly configured by the
user to the state that will then be used by qemu.
Peter Krempa July 8, 2016, 12:53 p.m. UTC | #9
On Fri, Jul 08, 2016 at 14:10:59 +0200, Igor Mammedov wrote:
> On Fri, 8 Jul 2016 14:04:23 +0200
> Peter Krempa <pkrempa@redhat.com> wrote:
> 
> > On Fri, Jul 08, 2016 at 13:54:58 +0200, Igor Mammedov wrote:
> > > On Thu,  7 Jul 2016 17:17:14 +0200
> > > Peter Krempa <pkrempa@redhat.com> wrote:

[...]

> > > it's along the way start QEMU -smp 1,maxcpus=X and then add
> > > remaining CPUs with device_add after getting properties from
> > > query_hotpluggable_cpus().  
> > 
> > I'm going to use a similar approach even for the hotpluggable cpus so I
> > can query the data for a new VM. On the other hand I can't make libvirt
> > use the approach with -smp 1,... all the time since we guarantee that an
> > XML that worked on an older version will be migratable back to the older
> > version.
> Could you explain a little bit more about the issue?

Libvirt attempts to maintain compatibility of the XML definition file
even for migrations to older versions.

If you are able to start a VM with an XML on libvirt version 'A', then
when you use the same XML to start it on a newer version 'B' we still
need to be able to migrate the VM back to 'A'. This means that vCPUs
added via -device/device_add can be used only for configurations that
explicitly have this enabled.

This shouldn't be a problem generally, but it means that we still need
either the 'old' way to work properly or an approach that is compatible
with migration.

For the specific case of vCPU hotplug this means that if you don't
hotplug any CPU it needs to work with older libvirt including the numa
topology and all other possible stuff.
David Gibson July 12, 2016, 3:27 a.m. UTC | #10
On Fri, 8 Jul 2016 09:46:00 +0200
Peter Krempa <pkrempa@redhat.com> wrote:

> On Fri, Jul 08, 2016 at 12:23:08 +1000, David Gibson wrote:
> > On Thu,  7 Jul 2016 17:17:14 +0200
> > Peter Krempa <pkrempa@redhat.com> wrote:
> >   
> > > Add a helper that looks up the NUMA node for a given CPU and use it to
> > > fill the node_id in the PPC and X86 impls of query-hotpluggable-cpus.  
> > 
> > 
> > IIUC how the query thing works this means that the node id issued by
> > query-hotpluggable-cpus will be echoed back to device add by libvirt.  
> 
> It will be echoed back, but the problem is that it's configured
> separately ...
> 
> > I'm not sure we actually process that information in the core at
> > present, so I don't know that that's right.
> > 
> > We need to be clear on which direction information is flowing here.
> > Does query-hotpluggable-cpus *define* the NUMA node allocation which is
> > then passed to the core device which implements it.  Or is the NUMA
> > allocation defined elsewhere, and query-hotpluggable-cpus just reports
> > it.  
> 
> Currently (in the pre-hotplug era) the NUMA topology is defined by
> specifying CPU numbers (see previous patch) on the commandline:
> 
> -numa node,nodeid=1,cpus=1-5,cpus=8,cpus=11...
> 
> This is then reported to the guest.
> 
> For a machine started with:
> 
> -smp 5,maxcpus=8,sockets=2,cores=2,threads=2
> -numa node,nodeid=0,cpus=0,cpus=2,cpus=4,cpus=6,mem=500
> -numa node,nodeid=1,cpus=1,cpus=3,cpus=5,cpus=7,mem=500
> 
> you get the following topology that is not really possible with
> hardware:
> 
> # lscpu
> Architecture:          x86_64
> CPU op-mode(s):        32-bit, 64-bit
> Byte Order:            Little Endian
> CPU(s):                5
> On-line CPU(s) list:   0-4
> Thread(s) per core:    1
> Core(s) per socket:    2
> Socket(s):             2
> NUMA node(s):          2
> Vendor ID:             GenuineIntel
> CPU family:            6
> Model:                 6
> Model name:            QEMU Virtual CPU version 2.5+
> Stepping:              3
> CPU MHz:               3504.318
> BogoMIPS:              7008.63
> Hypervisor vendor:     KVM
> Virtualization type:   full
> L1d cache:             32K
> L1i cache:             32K
> L2 cache:              4096K
> NUMA node0 CPU(s):     0,2,4
> NUMA node1 CPU(s):     1,3
> 
> Note that the count of cpus per numa node does not need to be identical.
> 
> Given the above, 'query-hotpluggable-cpus' will need to report the data
> that was configured above even if it doesn't make much sense in the real
> world.
> 
> I did not try the above on a PPC host and thus I'm not sure whether the
> config above is allowed or not.

It's not - although I'm not sure that we actually have something
enforcing this.

However, single cores *must* be in the same NUMA node - there's no way
of reporting to the guest anything finer grained.

> While for the hotplugged cpus it would be possible to plug in the correct
> one according to the queried data, with the current approach it's
> impossible to set the initial vcpus differently.
> 
> Peter
> 
> Note: For libvirt it's a no-go to start a throwaway qemu process just to
> query the information and thus it's desired to have a way to configure
> all this without the need to query with a specific machine
> type/topology.

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 4ba02c4..a0b9507 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -2115,6 +2115,7 @@  static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
     HotpluggableCPUList *head = NULL;
     PCMachineState *pcms = PC_MACHINE(machine);
     const char *cpu_type;
+    int node_id;

     cpu = pcms->possible_cpus->cpus[0].cpu;
     assert(cpu); /* BSP is always present */
@@ -2138,6 +2139,12 @@  static HotpluggableCPUList *pc_query_hotpluggable_cpus(MachineState *machine)
         cpu_props->core_id = topo.core_id;
         cpu_props->has_thread_id = true;
         cpu_props->thread_id = topo.smt_id;
+
+        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
+            cpu_props->has_node_id = true;
+            cpu_props->node_id = node_id;
+        }
+
         cpu_item->props = cpu_props;

         cpu = pcms->possible_cpus->cpus[i].cpu;
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d1f5195..06ba7fc 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2370,6 +2370,7 @@  static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
     sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
     int spapr_max_cores = max_cpus / smp_threads;
     int smt = kvmppc_smt_threads();
+    int node_id;

     for (i = 0; i < spapr_max_cores; i++) {
         HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1);
@@ -2381,8 +2382,11 @@  static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine)
         cpu_item->vcpu_id = i;
         cpu_props->has_core_id = true;
         cpu_props->core_id = i * smt;
-        /* TODO: add 'has_node/node' here to describe
-           to which node core belongs */
+
+        if ((node_id = numa_node_get_by_cpu_index(i)) >= 0) {
+            cpu_props->has_node_id = true;
+            cpu_props->node_id = node_id;
+        }

         cpu_item->props = cpu_props;
         if (spapr->cores[i]) {
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index bb184c9..04d7097 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -31,5 +31,6 @@  extern QemuOptsList qemu_numa_opts;
 void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
 void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
 uint32_t numa_get_node(ram_addr_t addr, Error **errp);
+int numa_node_get_by_cpu_index(int cpu_index);

 #endif
diff --git a/numa.c b/numa.c
index cbae430..365738a 100644
--- a/numa.c
+++ b/numa.c
@@ -506,6 +506,19 @@  void query_numa_node_mem(uint64_t node_mem[])
     }
 }

+int numa_node_get_by_cpu_index(int cpu_index)
+{
+    int i;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (test_bit(cpu_index, numa_info[i].node_cpu)) {
+            return i;
+        }
+    }
+
+    return -1;
+}
+
 static int query_memdev(Object *obj, void *opaque)
 {
     MemdevList **list = opaque;