Message ID: 20220718124315.16648-3-tariqt@nvidia.com (mailing list archive)
State: Awaiting Upstream
Delegated to: Netdev Maintainers
Series: mlx5: Use NUMA distance metrics
On Mon, Jul 18, 2022 at 03:43:15PM +0300, Tariq Toukan wrote:

> Reviewed-by: Gal Pressman <gal@nvidia.com>
> Acked-by: Saeed Mahameed <saeedm@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/eq.c | 62 +++++++++++++++++++-
>  1 file changed, 59 insertions(+), 3 deletions(-)
>
> v2:
> Separated the set_cpu operation into two functions, per Saeed's suggestion.
> Added Saeed's Acked-by signature.
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> index 229728c80233..e72bdaaad84f 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> @@ -11,6 +11,9 @@
>  #ifdef CONFIG_RFS_ACCEL
>  #include <linux/cpu_rmap.h>
>  #endif
> +#ifdef CONFIG_NUMA
> +#include <linux/sched/topology.h>
> +#endif
>  #include "mlx5_core.h"
>  #include "lib/eq.h"
>  #include "fpga/core.h"
> @@ -806,13 +809,67 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
>  	kfree(table->comp_irqs);
>  }
>
> +static void set_cpus_by_local_spread(struct mlx5_core_dev *dev, u16 *cpus,
> +				     int ncomp_eqs)
> +{
> +	int i;
> +
> +	for (i = 0; i < ncomp_eqs; i++)
> +		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
> +}
> +
> +static bool set_cpus_by_numa_distance(struct mlx5_core_dev *dev, u16 *cpus,
> +				      int ncomp_eqs)
> +{
> +#ifdef CONFIG_NUMA
> +	cpumask_var_t cpumask;
> +	int first;
> +	int i;
> +
> +	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) {
> +		mlx5_core_err(dev, "zalloc_cpumask_var failed\n");
> +		return false;
> +	}
> +	cpumask_copy(cpumask, cpu_online_mask);
> +
> +	first = cpumask_local_spread(0, dev->priv.numa_node);

Arguably you want something like:

	first = cpumask_any(cpumask_of_node(dev->priv.numa_node));

> +
> +	for (i = 0; i < ncomp_eqs; i++) {
> +		int cpu;
> +
> +		cpu = sched_numa_find_closest(cpumask, first);
> +		if (cpu >= nr_cpu_ids) {
> +			mlx5_core_err(dev, "sched_numa_find_closest failed, cpu(%d) >= nr_cpu_ids(%d)\n",
> +				      cpu, nr_cpu_ids);
> +
> +			free_cpumask_var(cpumask);
> +			return false;

So this will fail when ncomp_eqs > cpumask_weight(online_cpus); is that
desired?

> +		}
> +		cpus[i] = cpu;
> +		cpumask_clear_cpu(cpu, cpumask);

Since there is no concurrency on this cpumask, you don't need atomic
ops:

	__cpumask_clear_cpu(..);

> +	}
> +
> +	free_cpumask_var(cpumask);
> +	return true;
> +#else
> +	return false;
> +#endif
> +}
> +
> +static void mlx5_set_eqs_cpus(struct mlx5_core_dev *dev, u16 *cpus, int ncomp_eqs)
> +{
> +	bool success = set_cpus_by_numa_distance(dev, cpus, ncomp_eqs);
> +
> +	if (!success)
> +		set_cpus_by_local_spread(dev, cpus, ncomp_eqs);
> +}
> +
>  static int comp_irqs_request(struct mlx5_core_dev *dev)
>  {
>  	struct mlx5_eq_table *table = dev->priv.eq_table;
>  	int ncomp_eqs = table->num_comp_eqs;
>  	u16 *cpus;
>  	int ret;
> -	int i;
>
>  	ncomp_eqs = table->num_comp_eqs;
>  	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
> @@ -830,8 +887,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
>  		ret = -ENOMEM;
>  		goto free_irqs;
>  	}
> -	for (i = 0; i < ncomp_eqs; i++)
> -		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
> +	mlx5_set_eqs_cpus(dev, cpus, ncomp_eqs);

So you change this for mlx5, what about the other users of
cpumask_local_spread() ?
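[Editor's note: to make the two review suggestions above concrete, here is a minimal, illustrative sketch of the selection loop with both folded in — the anchor CPU taken straight off the node's cpumask, and the non-atomic clear on the private working mask. This is a reconstruction, not the merged code; pick_cpus_by_numa_distance() is a hypothetical name and the mlx5 error reporting is omitted.]

	#include <linux/cpumask.h>
	#include <linux/gfp.h>
	#include <linux/sched/topology.h>
	#include <linux/topology.h>
	#include <linux/types.h>

	/* Hypothetical sketch: fill cpus[] with n CPUs, nearest-first by
	 * NUMA distance from @node. Returns false on allocation failure
	 * or when the online mask is exhausted, so the caller can fall
	 * back to cpumask_local_spread().
	 */
	static bool pick_cpus_by_numa_distance(int node, u16 *cpus, int n)
	{
		cpumask_var_t mask;
		int first, i;

		if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
			return false;
		cpumask_copy(mask, cpu_online_mask);

		/* Per the review: take the anchor CPU directly from the
		 * node's cpumask rather than via cpumask_local_spread().
		 */
		first = cpumask_any(cpumask_of_node(node));

		for (i = 0; i < n; i++) {
			int cpu = sched_numa_find_closest(mask, first);

			if (cpu >= nr_cpu_ids) {	/* ran out of online CPUs */
				free_cpumask_var(mask);
				return false;
			}
			cpus[i] = cpu;
			/* The mask is private to this function, so the
			 * non-atomic clear is sufficient.
			 */
			__cpumask_clear_cpu(cpu, mask);
		}

		free_cpumask_var(mask);
		return true;
	}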
On 7/18/2022 4:50 PM, Peter Zijlstra wrote:
> On Mon, Jul 18, 2022 at 03:43:15PM +0300, Tariq Toukan wrote:
>
>> Reviewed-by: Gal Pressman <gal@nvidia.com>
>> Acked-by: Saeed Mahameed <saeedm@nvidia.com>
>> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
>> ---
>>  drivers/net/ethernet/mellanox/mlx5/core/eq.c | 62 +++++++++++++++++++-
>>  1 file changed, 59 insertions(+), 3 deletions(-)
>>
>> v2:
>> Separated the set_cpu operation into two functions, per Saeed's suggestion.
>> Added Saeed's Acked-by signature.
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
>> index 229728c80233..e72bdaaad84f 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
>> @@ -11,6 +11,9 @@
>>  #ifdef CONFIG_RFS_ACCEL
>>  #include <linux/cpu_rmap.h>
>>  #endif
>> +#ifdef CONFIG_NUMA
>> +#include <linux/sched/topology.h>
>> +#endif
>>  #include "mlx5_core.h"
>>  #include "lib/eq.h"
>>  #include "fpga/core.h"
>> @@ -806,13 +809,67 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
>>  	kfree(table->comp_irqs);
>>  }
>>
>> +static void set_cpus_by_local_spread(struct mlx5_core_dev *dev, u16 *cpus,
>> +				     int ncomp_eqs)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < ncomp_eqs; i++)
>> +		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
>> +}
>> +
>> +static bool set_cpus_by_numa_distance(struct mlx5_core_dev *dev, u16 *cpus,
>> +				      int ncomp_eqs)
>> +{
>> +#ifdef CONFIG_NUMA
>> +	cpumask_var_t cpumask;
>> +	int first;
>> +	int i;
>> +
>> +	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) {
>> +		mlx5_core_err(dev, "zalloc_cpumask_var failed\n");
>> +		return false;
>> +	}
>> +	cpumask_copy(cpumask, cpu_online_mask);
>> +
>> +	first = cpumask_local_spread(0, dev->priv.numa_node);
>
> Arguably you want something like:
>
> 	first = cpumask_any(cpumask_of_node(dev->priv.numa_node));

Any doesn't sound like what I'm looking for, I'm looking for first.
I do care about the order within the node, so it's more like
cpumask_first(cpumask_of_node(dev->priv.numa_node));

Do you think this has any advantage over cpumask_local_spread, if used
only during the setup phase of the driver?

>
>> +
>> +	for (i = 0; i < ncomp_eqs; i++) {
>> +		int cpu;
>> +
>> +		cpu = sched_numa_find_closest(cpumask, first);
>> +		if (cpu >= nr_cpu_ids) {
>> +			mlx5_core_err(dev, "sched_numa_find_closest failed, cpu(%d) >= nr_cpu_ids(%d)\n",
>> +				      cpu, nr_cpu_ids);
>> +
>> +			free_cpumask_var(cpumask);
>> +			return false;
>
> So this will fail when ncomp_eqs > cpumask_weight(online_cpus); is that
> desired?
>

Yes. ncomp_eqs does not exceed the num of online cores.

>> +		}
>> +		cpus[i] = cpu;
>> +		cpumask_clear_cpu(cpu, cpumask);
>
> Since there is no concurrency on this cpumask, you don't need atomic
> ops:
>
> 	__cpumask_clear_cpu(..);
>

Right. I'll fix.

>> +	}
>> +
>> +	free_cpumask_var(cpumask);
>> +	return true;
>> +#else
>> +	return false;
>> +#endif
>> +}
>> +
>> +static void mlx5_set_eqs_cpus(struct mlx5_core_dev *dev, u16 *cpus, int ncomp_eqs)
>> +{
>> +	bool success = set_cpus_by_numa_distance(dev, cpus, ncomp_eqs);
>> +
>> +	if (!success)
>> +		set_cpus_by_local_spread(dev, cpus, ncomp_eqs);
>> +}
>> +
>>  static int comp_irqs_request(struct mlx5_core_dev *dev)
>>  {
>>  	struct mlx5_eq_table *table = dev->priv.eq_table;
>>  	int ncomp_eqs = table->num_comp_eqs;
>>  	u16 *cpus;
>>  	int ret;
>> -	int i;
>>
>>  	ncomp_eqs = table->num_comp_eqs;
>>  	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
>> @@ -830,8 +887,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
>>  		ret = -ENOMEM;
>>  		goto free_irqs;
>>  	}
>> -	for (i = 0; i < ncomp_eqs; i++)
>> -		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
>> +	mlx5_set_eqs_cpus(dev, cpus, ncomp_eqs);
>
> So you change this for mlx5, what about the other users of
> cpumask_local_spread() ?

I took a look at the different netdev users.
While some users have similar use case to ours (affinity hints), many
others use cpumask_local_spread in other flows (XPS setting, ring
allocations, etc..).

Moving them to use the newly exposed API needs some deeper dive into
their code, especially due to the possible undesired side-effects.

I prefer not to include these changes in my series for now, but probably
contribute it in a followup work.

Regards,
Tariq
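[Editor's note: the invariant Tariq relies on above — ncomp_eqs never exceeding the number of online cores — could also be made explicit at the call site. A hypothetical one-line guard, not part of this series:]

	#include <linux/cpumask.h>
	#include <linux/minmax.h>

	/* Hypothetical clamp: never request more EQ CPUs than are online,
	 * so the NUMA-distance walk cannot exhaust its working mask.
	 */
	static int clamp_ncomp_eqs(int requested)
	{
		return min_t(int, requested, cpumask_weight(cpu_online_mask));
	}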
On Mon, Jul 18, 2022 at 10:49:21PM +0300, Tariq Toukan wrote:

> > > +	first = cpumask_local_spread(0, dev->priv.numa_node);
> >
> > Arguably you want something like:
> >
> > 	first = cpumask_any(cpumask_of_node(dev->priv.numa_node));
>
> Any doesn't sound like what I'm looking for, I'm looking for first.
> I do care about the order within the node, so it's more like
> cpumask_first(cpumask_of_node(dev->priv.numa_node));
>
> Do you think this has any advantage over cpumask_local_spread, if used only
> during the setup phase of the driver?

Only for the poor sod trying to read this code ;-) That is, I had no
idea what cpumask_local_spread() does, while cpumask_first() is fairly
obvious.

> > > @@ -830,8 +887,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
> > >  		ret = -ENOMEM;
> > >  		goto free_irqs;
> > >  	}
> > > -	for (i = 0; i < ncomp_eqs; i++)
> > > -		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
> > > +	mlx5_set_eqs_cpus(dev, cpus, ncomp_eqs);
> >
> > So you change this for mlx5, what about the other users of
> > cpumask_local_spread() ?
>
> I took a look at the different netdev users.
> While some users have similar use case to ours (affinity hints), many others
> use cpumask_local_spread in other flows (XPS setting, ring allocations,
> etc..).
>
> Moving them to use the newly exposed API needs some deeper dive into their
> code, especially due to the possible undesired side-effects.
>
> I prefer not to include these changes in my series for now, but probably
> contribute it in a followup work.

Fair enough.
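[Editor's note: for readers comparing the three spellings discussed for the anchor CPU, a small illustrative sketch, ignoring CPU-hotplug corner cases; all three pick a CPU on the device's home node:]

	#include <linux/bug.h>
	#include <linux/cpumask.h>
	#include <linux/topology.h>

	/* Three ways to pick the anchor CPU on the device's home node.
	 * They differ mainly in how clearly they state the intent.
	 */
	static int anchor_cpu(int node)
	{
		/* original patch */
		int a = cpumask_local_spread(0, node);
		/* Peter's suggestion */
		int b = cpumask_any(cpumask_of_node(node));
		/* Tariq's preference: explicit about wanting the first CPU */
		int c = cpumask_first(cpumask_of_node(node));

		/* At the time of this thread cpumask_any() was defined as
		 * cpumask_first(), so b == c; a is expected to match them
		 * whenever the node's first CPU is online.
		 */
		WARN_ON_ONCE(a != b || b != c);
		return c;
	}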
On Mon, 18 Jul 2022 22:49:21 +0300 Tariq Toukan wrote:
> >> @@ -830,8 +887,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
> >>  		ret = -ENOMEM;
> >>  		goto free_irqs;
> >>  	}
> >> -	for (i = 0; i < ncomp_eqs; i++)
> >> -		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
> >> +	mlx5_set_eqs_cpus(dev, cpus, ncomp_eqs);
> >
> > So you change this for mlx5, what about the other users of
> > cpumask_local_spread() ?
>
> I took a look at the different netdev users.
> While some users have similar use case to ours (affinity hints), many
> others use cpumask_local_spread in other flows (XPS setting, ring
> allocations, etc..).
>
> Moving them to use the newly exposed API needs some deeper dive into
> their code, especially due to the possible undesired side-effects.
>
> I prefer not to include these changes in my series for now, but probably
> contribute it in a followup work.

It'd be great if you could pick any other driver and start creating the
right APIs for it and mlx5. "Probably contribute followup work" does not
inspire confidence.

And yes, I am being picky, I'm holding a grudge against mlx5 for not
using netif_get_num_default_rss_queues().
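[Editor's note: for context on that last remark, netif_get_num_default_rss_queues() is the core's suggested default RSS queue count. A hypothetical sketch of a driver using it when sizing completion vectors; max_eqs_from_hw is an assumed device limit, not an mlx5 field:]

	#include <linux/cpumask.h>
	#include <linux/minmax.h>
	#include <linux/netdevice.h>

	/* Hypothetical sizing helper: cap completion queues at the stack's
	 * default RSS queue count instead of the raw CPU count alone.
	 */
	static int default_num_comp_eqs(int max_eqs_from_hw)
	{
		return min3(max_eqs_from_hw,
			    netif_get_num_default_rss_queues(),
			    (int)num_online_cpus());
	}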
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 229728c80233..e72bdaaad84f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -11,6 +11,9 @@
 #ifdef CONFIG_RFS_ACCEL
 #include <linux/cpu_rmap.h>
 #endif
+#ifdef CONFIG_NUMA
+#include <linux/sched/topology.h>
+#endif
 #include "mlx5_core.h"
 #include "lib/eq.h"
 #include "fpga/core.h"
@@ -806,13 +809,67 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
 	kfree(table->comp_irqs);
 }
 
+static void set_cpus_by_local_spread(struct mlx5_core_dev *dev, u16 *cpus,
+				     int ncomp_eqs)
+{
+	int i;
+
+	for (i = 0; i < ncomp_eqs; i++)
+		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+}
+
+static bool set_cpus_by_numa_distance(struct mlx5_core_dev *dev, u16 *cpus,
+				      int ncomp_eqs)
+{
+#ifdef CONFIG_NUMA
+	cpumask_var_t cpumask;
+	int first;
+	int i;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) {
+		mlx5_core_err(dev, "zalloc_cpumask_var failed\n");
+		return false;
+	}
+	cpumask_copy(cpumask, cpu_online_mask);
+
+	first = cpumask_local_spread(0, dev->priv.numa_node);
+
+	for (i = 0; i < ncomp_eqs; i++) {
+		int cpu;
+
+		cpu = sched_numa_find_closest(cpumask, first);
+		if (cpu >= nr_cpu_ids) {
+			mlx5_core_err(dev, "sched_numa_find_closest failed, cpu(%d) >= nr_cpu_ids(%d)\n",
+				      cpu, nr_cpu_ids);
+
+			free_cpumask_var(cpumask);
+			return false;
+		}
+		cpus[i] = cpu;
+		cpumask_clear_cpu(cpu, cpumask);
+	}
+
+	free_cpumask_var(cpumask);
+	return true;
+#else
+	return false;
+#endif
+}
+
+static void mlx5_set_eqs_cpus(struct mlx5_core_dev *dev, u16 *cpus, int ncomp_eqs)
+{
+	bool success = set_cpus_by_numa_distance(dev, cpus, ncomp_eqs);
+
+	if (!success)
+		set_cpus_by_local_spread(dev, cpus, ncomp_eqs);
+}
+
 static int comp_irqs_request(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	int ncomp_eqs = table->num_comp_eqs;
 	u16 *cpus;
 	int ret;
-	int i;
 
 	ncomp_eqs = table->num_comp_eqs;
 	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
@@ -830,8 +887,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
 		ret = -ENOMEM;
 		goto free_irqs;
 	}
-	for (i = 0; i < ncomp_eqs; i++)
-		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+	mlx5_set_eqs_cpus(dev, cpus, ncomp_eqs);
 	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
 	kfree(cpus);
 	if (ret < 0)