Message ID | 1503372250-5092-2-git-send-email-douly.fnst@cn.fujitsu.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Aug 22, 2017 at 11:24:09AM +0800, Dou Liyang wrote: > Currently, Using the fisrt node without memory on the machine makes > QEMU unhappy. With this example command line: > ... \ > -m 1024M,slots=4,maxmem=32G \ > -numa node,nodeid=0 \ > -numa node,mem=1024M,nodeid=1 \ > -numa node,nodeid=2 \ > -numa node,nodeid=3 \ > Guest reports "No NUMA configuration found" and the NUMA topology is > wrong. > > This is because when QEMU builds ACPI SRAT, it regards node0 as the > default node to deal with the memory hole(640K-1M). this means the > node0 must have some memory(>1M), but, actually it can have no > memory. > > Fix this problem by replace the node0 with the first node which has > memory on it. Add a new function for each node. Also do some cleanup. > > Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com> > --- > hw/i386/acpi-build.c | 78 +++++++++++++++++++++++++++++++++------------------- > 1 file changed, 50 insertions(+), 28 deletions(-) > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > index 98dd424..f93d712 100644 > --- a/hw/i386/acpi-build.c > +++ b/hw/i386/acpi-build.c > @@ -2318,15 +2318,43 @@ build_tpm2(GArray *table_data, BIOSLinker *linker) > (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL); > } > > +static uint64_t > +build_srat_node_entry(GArray *table_data, PCMachineState *pcms, > + int i, uint64_t mem_base, uint64_t mem_len) > +{ > + AcpiSratMemoryAffinity *numamem; > + uint64_t next_base; > + > + next_base = mem_base + mem_len; > + > + /* Cut out the ACPI_PCI hole */ > + if (mem_base <= pcms->below_4g_mem_size && > + next_base > pcms->below_4g_mem_size) { > + mem_len -= next_base - pcms->below_4g_mem_size; > + if (mem_len > 0) { > + numamem = acpi_data_push(table_data, sizeof *numamem); > + build_srat_memory(numamem, mem_base, mem_len, i, > + MEM_AFFINITY_ENABLED); > + } > + mem_base = 1ULL << 32; > + mem_len = next_base - pcms->below_4g_mem_size; > + next_base += (1ULL << 32) - pcms->below_4g_mem_size; > + } > + numamem = acpi_data_push(table_data, sizeof *numamem); > + build_srat_memory(numamem, mem_base, mem_len, i, > + MEM_AFFINITY_ENABLED); > + return next_base; > +} > + > static void > build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) > { > AcpiSystemResourceAffinityTable *srat; > AcpiSratMemoryAffinity *numamem; > > - int i; > + int i, node; > int srat_start, numa_start, slots; > - uint64_t mem_len, mem_base, next_base; > + uint64_t mem_len, mem_base; > MachineClass *mc = MACHINE_GET_CLASS(machine); > const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine); > PCMachineState *pcms = PC_MACHINE(machine); > @@ -2370,36 +2398,30 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) > /* the memory map is a bit tricky, it contains at least one hole > * from 640k-1M and possibly another one from 3.5G-4G. > */ > - next_base = 0; > + > numa_start = table_data->len; > > - numamem = acpi_data_push(table_data, sizeof *numamem); > - build_srat_memory(numamem, 0, 640 * 1024, 0, MEM_AFFINITY_ENABLED); > - next_base = 1024 * 1024; > - for (i = 1; i < pcms->numa_nodes + 1; ++i) { > - mem_base = next_base; > - mem_len = pcms->node_mem[i - 1]; > - if (i == 1) { > - mem_len -= 1024 * 1024; > + /* get the first node which has memory and map the hole from 640K-1M */ > + for (node = 0; node < pcms->numa_nodes; node++) { > + if (pcms->node_mem[node] != 0) { > + break; > } > - next_base = mem_base + mem_len; > - > - /* Cut out the ACPI_PCI hole */ > - if (mem_base <= pcms->below_4g_mem_size && > - next_base > pcms->below_4g_mem_size) { > - mem_len -= next_base - pcms->below_4g_mem_size; > - if (mem_len > 0) { > - numamem = acpi_data_push(table_data, sizeof *numamem); > - build_srat_memory(numamem, mem_base, mem_len, i - 1, > - MEM_AFFINITY_ENABLED); > - } > - mem_base = 1ULL << 32; > - mem_len = next_base - pcms->below_4g_mem_size; > - next_base += (1ULL << 32) - pcms->below_4g_mem_size; > + } > + numamem = acpi_data_push(table_data, sizeof *numamem); > + build_srat_memory(numamem, 0, 640 * 1024, node, MEM_AFFINITY_ENABLED); > + > + /* map the rest of memory from 1M */ > + mem_base = 1024 * 1024; > + mem_len = pcms->node_mem[node] - mem_base; > + mem_base = build_srat_node_entry(table_data, pcms, node, > + mem_base, mem_len); > + > + for (i = 0; i < pcms->numa_nodes; i++) { > + if (i == node) { > + continue; > } Now the nodes will be out of order, if node 0 has no RAM. Why? Why not implement this in the same way the PCI 4GB hole is already implemented. e.g.: #define HOLE_640K_START (640 * 1024) #define HOLE_640K_END (1024 * 1024) for (node = 0; node < pcms->numa_nodes; node++) { mem_base = next_base; mem_len = pcms->node_mem[node]; next_base = mem_base + mem_len; /* Cut out the 640K hole */ if (mem_base <= HOLE_640K_START && next_base > HOLE_640K_START) { mem_len -= next_base - HOLE_640K; if (mem_len > 0) { numamem = acpi_data_push(table_data, sizeof *numamem); build_srat_memory(numamem, mem_base, mem_len, node, MEM_AFFINITY_ENABLED); } mem_base = HOLE_640K_END; /* ... */ } /* Cut out the ACPI_PCI hole */ if (mem_base <= pcms->below_4g_mem_size && next_base > pcms->below_4g_mem_size) { mem_len -= next_base - pcms->below_4g_mem_size; /* ... *] } numamem = acpi_data_push(table_data, sizeof *numamem); build_srat_memory(numamem, mem_base, mem_len, node, MEM_AFFINITY_ENABLED); /* ... */ } > - numamem = acpi_data_push(table_data, sizeof *numamem); > - build_srat_memory(numamem, mem_base, mem_len, i - 1, > - MEM_AFFINITY_ENABLED); > + mem_base = build_srat_node_entry(table_data, pcms, i, > + mem_base, pcms->node_mem[i]); > } > slots = (table_data->len - numa_start) / sizeof *numamem; > for (; slots < pcms->numa_nodes + 2; slots++) { > -- > 2.5.5 > > > >
Hi Eduardo, [...] >> + continue; >> } > > Now the nodes will be out of order, if node 0 has no RAM. Why? > Because the code parsed the other node with RAM first, then parsed the node 0. > Why not implement this in the same way the PCI 4GB hole is > already implemented. e.g.: > Indeed, it is better and more refined. I will do it in the next version. > #define HOLE_640K_START (640 * 1024) > #define HOLE_640K_END (1024 * 1024) > > for (node = 0; node < pcms->numa_nodes; node++) { > mem_base = next_base; > mem_len = pcms->node_mem[node]; > next_base = mem_base + mem_len; > > /* Cut out the 640K hole */ > if (mem_base <= HOLE_640K_START && > next_base > HOLE_640K_START) { > mem_len -= next_base - HOLE_640K; > if (mem_len > 0) { > numamem = acpi_data_push(table_data, sizeof *numamem); > build_srat_memory(numamem, mem_base, mem_len, node, > MEM_AFFINITY_ENABLED); > } > mem_base = HOLE_640K_END; > /* ... */ BTW, in case 0 |-----------|-------|--------|------- | | | | mem_base 640K next_base 1M I wanna add a check here, /* Check for the rare case: 640K < RAM < 1M */ if (next_base <= HOLE_640K_END) { next_base = HOLE_640K_END; continue; } mem_base = HOLE_640K_END; mem_len = next_base - HOLE_640K_END; I guess no one would set a node with this less RAM, So, Is that necessary? Thanks, dou. > } > /* Cut out the ACPI_PCI hole */ > if (mem_base <= pcms->below_4g_mem_size && > next_base > pcms->below_4g_mem_size) { > mem_len -= next_base - pcms->below_4g_mem_size; > /* ... *] > } > numamem = acpi_data_push(table_data, sizeof *numamem); > build_srat_memory(numamem, mem_base, mem_len, node, > MEM_AFFINITY_ENABLED); > /* ... */ > } > >> - numamem = acpi_data_push(table_data, sizeof *numamem); >> - build_srat_memory(numamem, mem_base, mem_len, i - 1, >> - MEM_AFFINITY_ENABLED); >> + mem_base = build_srat_node_entry(table_data, pcms, i, >> + mem_base, pcms->node_mem[i]); >> } >> slots = (table_data->len - numa_start) / sizeof *numamem; >> for (; slots < pcms->numa_nodes + 2; slots++) { >> -- >> 2.5.5 >> >> >> >> >
Hi Eduardo, At 08/31/2017 06:38 PM, Dou Liyang wrote: > Hi Eduardo, > > [...] >>> + continue; >>> } >> >> Now the nodes will be out of order, if node 0 has no RAM. Why? >> > > Because the code parsed the other node with RAM first, then parsed the > node 0. > >> Why not implement this in the same way the PCI 4GB hole is >> already implemented. e.g.: >> > > Indeed, it is better and more refined. I will do it in the next version. > >> #define HOLE_640K_START (640 * 1024) >> #define HOLE_640K_END (1024 * 1024) >> >> for (node = 0; node < pcms->numa_nodes; node++) { >> mem_base = next_base; >> mem_len = pcms->node_mem[node]; >> next_base = mem_base + mem_len; >> >> /* Cut out the 640K hole */ >> if (mem_base <= HOLE_640K_START && >> next_base > HOLE_640K_START) { >> mem_len -= next_base - HOLE_640K; >> if (mem_len > 0) { >> numamem = acpi_data_push(table_data, sizeof *numamem); >> build_srat_memory(numamem, mem_base, mem_len, node, >> MEM_AFFINITY_ENABLED); >> } >> mem_base = HOLE_640K_END; >> /* ... */ > > BTW, in case > > 0 > |-----------|-------|--------|------- > | | | | > mem_base 640K next_base 1M > > > I wanna add a check here, > > /* Check for the rare case: 640K < RAM < 1M */ > if (next_base <= HOLE_640K_END) { > next_base = HOLE_640K_END; > continue; > } > mem_base = HOLE_640K_END; > mem_len = next_base - HOLE_640K_END; > > I guess no one would set a node with this less RAM, > So, Is that necessary? > I post the V5 patches, and use your code. It may be not very clearly, for the detail, Please see the new version. Thanks, dou. > Thanks, > dou. > >> } >> /* Cut out the ACPI_PCI hole */ >> if (mem_base <= pcms->below_4g_mem_size && >> next_base > pcms->below_4g_mem_size) { >> mem_len -= next_base - pcms->below_4g_mem_size; >> /* ... *] >> } >> numamem = acpi_data_push(table_data, sizeof *numamem); >> build_srat_memory(numamem, mem_base, mem_len, node, >> MEM_AFFINITY_ENABLED); >> /* ... */ >> } >> >>> - numamem = acpi_data_push(table_data, sizeof *numamem); >>> - build_srat_memory(numamem, mem_base, mem_len, i - 1, >>> - MEM_AFFINITY_ENABLED); >>> + mem_base = build_srat_node_entry(table_data, pcms, i, >>> + mem_base, >>> pcms->node_mem[i]); >>> } >>> slots = (table_data->len - numa_start) / sizeof *numamem; >>> for (; slots < pcms->numa_nodes + 2; slots++) { >>> -- >>> 2.5.5 >>> >>> >>> >>> >>
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 98dd424..f93d712 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2318,15 +2318,43 @@ build_tpm2(GArray *table_data, BIOSLinker *linker) (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL); } +static uint64_t +build_srat_node_entry(GArray *table_data, PCMachineState *pcms, + int i, uint64_t mem_base, uint64_t mem_len) +{ + AcpiSratMemoryAffinity *numamem; + uint64_t next_base; + + next_base = mem_base + mem_len; + + /* Cut out the ACPI_PCI hole */ + if (mem_base <= pcms->below_4g_mem_size && + next_base > pcms->below_4g_mem_size) { + mem_len -= next_base - pcms->below_4g_mem_size; + if (mem_len > 0) { + numamem = acpi_data_push(table_data, sizeof *numamem); + build_srat_memory(numamem, mem_base, mem_len, i, + MEM_AFFINITY_ENABLED); + } + mem_base = 1ULL << 32; + mem_len = next_base - pcms->below_4g_mem_size; + next_base += (1ULL << 32) - pcms->below_4g_mem_size; + } + numamem = acpi_data_push(table_data, sizeof *numamem); + build_srat_memory(numamem, mem_base, mem_len, i, + MEM_AFFINITY_ENABLED); + return next_base; +} + static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { AcpiSystemResourceAffinityTable *srat; AcpiSratMemoryAffinity *numamem; - int i; + int i, node; int srat_start, numa_start, slots; - uint64_t mem_len, mem_base, next_base; + uint64_t mem_len, mem_base; MachineClass *mc = MACHINE_GET_CLASS(machine); const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine); PCMachineState *pcms = PC_MACHINE(machine); @@ -2370,36 +2398,30 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) /* the memory map is a bit tricky, it contains at least one hole * from 640k-1M and possibly another one from 3.5G-4G. */ - next_base = 0; + numa_start = table_data->len; - numamem = acpi_data_push(table_data, sizeof *numamem); - build_srat_memory(numamem, 0, 640 * 1024, 0, MEM_AFFINITY_ENABLED); - next_base = 1024 * 1024; - for (i = 1; i < pcms->numa_nodes + 1; ++i) { - mem_base = next_base; - mem_len = pcms->node_mem[i - 1]; - if (i == 1) { - mem_len -= 1024 * 1024; + /* get the first node which has memory and map the hole from 640K-1M */ + for (node = 0; node < pcms->numa_nodes; node++) { + if (pcms->node_mem[node] != 0) { + break; } - next_base = mem_base + mem_len; - - /* Cut out the ACPI_PCI hole */ - if (mem_base <= pcms->below_4g_mem_size && - next_base > pcms->below_4g_mem_size) { - mem_len -= next_base - pcms->below_4g_mem_size; - if (mem_len > 0) { - numamem = acpi_data_push(table_data, sizeof *numamem); - build_srat_memory(numamem, mem_base, mem_len, i - 1, - MEM_AFFINITY_ENABLED); - } - mem_base = 1ULL << 32; - mem_len = next_base - pcms->below_4g_mem_size; - next_base += (1ULL << 32) - pcms->below_4g_mem_size; + } + numamem = acpi_data_push(table_data, sizeof *numamem); + build_srat_memory(numamem, 0, 640 * 1024, node, MEM_AFFINITY_ENABLED); + + /* map the rest of memory from 1M */ + mem_base = 1024 * 1024; + mem_len = pcms->node_mem[node] - mem_base; + mem_base = build_srat_node_entry(table_data, pcms, node, + mem_base, mem_len); + + for (i = 0; i < pcms->numa_nodes; i++) { + if (i == node) { + continue; } - numamem = acpi_data_push(table_data, sizeof *numamem); - build_srat_memory(numamem, mem_base, mem_len, i - 1, - MEM_AFFINITY_ENABLED); + mem_base = build_srat_node_entry(table_data, pcms, i, + mem_base, pcms->node_mem[i]); } slots = (table_data->len - numa_start) / sizeof *numamem; for (; slots < pcms->numa_nodes + 2; slots++) {
Currently, Using the fisrt node without memory on the machine makes QEMU unhappy. With this example command line: ... \ -m 1024M,slots=4,maxmem=32G \ -numa node,nodeid=0 \ -numa node,mem=1024M,nodeid=1 \ -numa node,nodeid=2 \ -numa node,nodeid=3 \ Guest reports "No NUMA configuration found" and the NUMA topology is wrong. This is because when QEMU builds ACPI SRAT, it regards node0 as the default node to deal with the memory hole(640K-1M). this means the node0 must have some memory(>1M), but, actually it can have no memory. Fix this problem by replace the node0 with the first node which has memory on it. Add a new function for each node. Also do some cleanup. Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com> --- hw/i386/acpi-build.c | 78 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 28 deletions(-)