@@ -2753,6 +2753,11 @@ static void spapr_machine_init(MachineState *machine)
spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
+ /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
+ if (!smc->pre_6_2_numa_affinity) {
+ spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
+ }
+
/* advertise support for dedicated HP event source to guests */
if (spapr->use_hotplug_event_source) {
spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
@@ -4675,8 +4680,11 @@ DEFINE_SPAPR_MACHINE(6_2, "6.2", true);
*/
static void spapr_machine_6_1_class_options(MachineClass *mc)
{
+ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+
spapr_machine_6_2_class_options(mc);
compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
+ smc->pre_6_2_numa_affinity = true;
}
DEFINE_SPAPR_MACHINE(6_1, "6.1", false);
@@ -24,6 +24,10 @@
*/
static int get_max_dist_ref_points(SpaprMachineState *spapr)
{
+ if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
+ return FORM2_DIST_REF_POINTS;
+ }
+
return FORM1_DIST_REF_POINTS;
}
@@ -32,6 +36,10 @@ static int get_max_dist_ref_points(SpaprMachineState *spapr)
*/
static int get_numa_assoc_size(SpaprMachineState *spapr)
{
+ if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
+ return FORM2_NUMA_ASSOC_SIZE;
+ }
+
return FORM1_NUMA_ASSOC_SIZE;
}
@@ -52,6 +60,9 @@ static int get_vcpu_assoc_size(SpaprMachineState *spapr)
*/
static const uint32_t *get_associativity(SpaprMachineState *spapr, int node_id)
{
+ if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
+ return spapr->FORM2_assoc_array[node_id];
+ }
return spapr->FORM1_assoc_array[node_id];
}
@@ -295,14 +306,50 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
spapr_numa_define_FORM1_domains(spapr);
}
+/*
+ * Init NUMA FORM2 machine state data
+ */
+static void spapr_numa_FORM2_affinity_init(SpaprMachineState *spapr)
+{
+ int i;
+
+ /*
+ * For all resources but CPUs, FORM2 associativity arrays will
+ * be a size 2 array with the following format:
+ *
+ * ibm,associativity = {1, numa_id}
+ *
+ * CPUs will write an additional 'vcpu_id' on top of the arrays
+ * being initialized here. 'numa_id' is represented by the
+ * index 'i' of the loop.
+ *
+ * Given that this initialization is also valid for GPU associativity
+ * arrays, handle everything in one single step by populating the
+ * arrays up to NUMA_NODES_MAX_NUM.
+ */
+ for (i = 0; i < NUMA_NODES_MAX_NUM; i++) {
+ spapr->FORM2_assoc_array[i][0] = cpu_to_be32(1);
+ spapr->FORM2_assoc_array[i][1] = cpu_to_be32(i);
+ }
+}
+
void spapr_numa_associativity_init(SpaprMachineState *spapr,
MachineState *machine)
{
spapr_numa_FORM1_affinity_init(spapr, machine);
+ spapr_numa_FORM2_affinity_init(spapr);
}
void spapr_numa_associativity_check(SpaprMachineState *spapr)
{
+ /*
+ * FORM2 does not have any restrictions we need to handle
+ * at CAS time, for now.
+ */
+ if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
+ return;
+ }
+
spapr_numa_FORM1_affinity_check(MACHINE(spapr));
}
@@ -447,6 +494,100 @@ static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
maxdomains, sizeof(maxdomains)));
}
+static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr,
+ void *fdt, int rtas)
+{
+ MachineState *ms = MACHINE(spapr);
+ NodeInfo *numa_info = ms->numa_state->nodes;
+ int nb_numa_nodes = ms->numa_state->num_nodes;
+ int distance_table_entries = nb_numa_nodes * nb_numa_nodes;
+ g_autofree uint32_t *lookup_index_table = NULL;
+ g_autofree uint32_t *distance_table = NULL;
+ int src, dst, i, distance_table_size;
+ uint8_t *node_distances;
+
+ /*
+ * ibm,numa-lookup-index-table: array with length and a
+ * list of NUMA ids present in the guest.
+ */
+ lookup_index_table = g_new0(uint32_t, nb_numa_nodes + 1);
+ lookup_index_table[0] = cpu_to_be32(nb_numa_nodes);
+
+ for (i = 0; i < nb_numa_nodes; i++) {
+ lookup_index_table[i + 1] = cpu_to_be32(i);
+ }
+
+ _FDT(fdt_setprop(fdt, rtas, "ibm,numa-lookup-index-table",
+ lookup_index_table,
+ (nb_numa_nodes + 1) * sizeof(uint32_t)));
+
+ /*
+ * ibm,numa-distance-table: contains all node distances. First
+ * element is the size of the table as uint32, followed up
+ * by all the uint8 distances from the first NUMA node, then all
+ * distances from the second NUMA node and so on.
+ *
+ * ibm,numa-lookup-index-table is used by guest to navigate this
+ * array because NUMA ids can be sparse (node 0 is the first,
+ * node 8 is the second ...).
+ */
+ distance_table = g_new0(uint32_t, distance_table_entries + 1);
+ distance_table[0] = cpu_to_be32(distance_table_entries);
+
+ node_distances = (uint8_t *)&distance_table[1];
+ i = 0;
+
+ for (src = 0; src < nb_numa_nodes; src++) {
+ for (dst = 0; dst < nb_numa_nodes; dst++) {
+ node_distances[i++] = numa_info[src].distance[dst];
+ }
+ }
+
+ distance_table_size = distance_table_entries * sizeof(uint8_t) +
+ sizeof(uint32_t);
+ _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table",
+ distance_table, distance_table_size));
+}
+
+/*
+ * This helper could be compressed in a single function with
+ * FORM1 logic since we're setting the same DT values, with the
+ * difference being a call to spapr_numa_FORM2_write_rtas_tables()
+ * in the end. The separation was made to avoid clogging FORM1 code
+ * which already has to deal with compat modes from previous
+ * QEMU machine types.
+ */
+static void spapr_numa_FORM2_write_rtas_dt(SpaprMachineState *spapr,
+ void *fdt, int rtas)
+{
+ MachineState *ms = MACHINE(spapr);
+ uint32_t number_nvgpus_nodes = spapr->gpu_numa_id -
+ spapr_numa_initial_nvgpu_numa_id(ms);
+
+ /*
+ * In FORM2, ibm,associativity-reference-points will point to
+ * the element in the ibm,associativity array that contains the
+ * primary domain index (for FORM2, the first element).
+ *
+ * This value (in our case, the numa-id) is then used as an index
+ * to retrieve all other attributes of the node (distance,
+ * bandwidth, latency) via ibm,numa-lookup-index-table and other
+ * ibm,numa-*-table properties.
+ */
+ uint32_t refpoints[] = { cpu_to_be32(1) };
+
+ uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes;
+ uint32_t maxdomains[] = { cpu_to_be32(1), cpu_to_be32(maxdomain) };
+
+ _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
+ refpoints, sizeof(refpoints)));
+
+ _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
+ maxdomains, sizeof(maxdomains)));
+
+ spapr_numa_FORM2_write_rtas_tables(spapr, fdt, rtas);
+}
+
/*
* Helper that writes ibm,associativity-reference-points and
* max-associativity-domains in the RTAS pointed by @rtas
@@ -454,6 +595,11 @@ static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
*/
void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
{
+ if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
+ spapr_numa_FORM2_write_rtas_dt(spapr, fdt, rtas);
+ return;
+ }
+
spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas);
}
@@ -118,6 +118,13 @@ typedef enum {
#define FORM1_DIST_REF_POINTS 4
#define FORM1_NUMA_ASSOC_SIZE (FORM1_DIST_REF_POINTS + 1)
+/*
+ * FORM2 NUMA affinity has a single associativity domain, giving
+ * us a assoc size of 2.
+ */
+#define FORM2_DIST_REF_POINTS 1
+#define FORM2_NUMA_ASSOC_SIZE (FORM2_DIST_REF_POINTS + 1)
+
typedef struct SpaprCapabilities SpaprCapabilities;
struct SpaprCapabilities {
uint8_t caps[SPAPR_CAP_NUM];
@@ -145,6 +152,7 @@ struct SpaprMachineClass {
hwaddr rma_limit; /* clamp the RMA to this size */
bool pre_5_1_assoc_refpoints;
bool pre_5_2_numa_associativity;
+ bool pre_6_2_numa_affinity;
bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
@@ -250,6 +258,7 @@ struct SpaprMachineState {
SpaprTpmProxy *tpm_proxy;
uint32_t FORM1_assoc_array[NUMA_NODES_MAX_NUM][FORM1_NUMA_ASSOC_SIZE];
+ uint32_t FORM2_assoc_array[NUMA_NODES_MAX_NUM][FORM2_NUMA_ASSOC_SIZE];
Error *fwnmi_migration_blocker;
};
@@ -49,6 +49,7 @@ typedef struct SpaprOptionVector SpaprOptionVector;
/* option vector 5 */
#define OV5_DRCONF_MEMORY OV_BIT(2, 2)
#define OV5_FORM1_AFFINITY OV_BIT(5, 0)
+#define OV5_FORM2_AFFINITY OV_BIT(5, 2)
#define OV5_HP_EVT OV_BIT(6, 5)
#define OV5_HPT_RESIZE OV_BIT(6, 7)
#define OV5_DRMEM_V2 OV_BIT(22, 0)