lightnvm: pblk: set write thread affinity to particular cpu

Message ID 1552573079-12712-1-git-send-email-marcin.dziegielewski@intel.com (mailing list archive)
State New, archived
Series lightnvm: pblk: set write thread affinity to particular cpu

Commit Message

Marcin Dziegielewski March 14, 2019, 2:17 p.m. UTC
In some cases, write thread migration between CPUs can cause writes to
be submitted out of order and, in consequence, a lot of errors from the
device.

Setting the write thread's affinity to a particular CPU prevents this.

Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
---
 drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
 drivers/lightnvm/pblk.h      |  1 +
 drivers/nvme/host/lightnvm.c |  1 +
 include/linux/lightnvm.h     |  2 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

Comments

Javier González March 14, 2019, 2:22 p.m. UTC | #1
> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>
> [...]

We have a per-CPU semaphore that only allows sending a single I/O in
order to prevent write pointer violations. Are you seeing this error, or
is it theoretical?

Javier
Javier González March 14, 2019, 2:22 p.m. UTC | #2
> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
> 
>> [...]
> 
> We have a per-CPU semaphore that only allows sending a single I/O in
> order to prevent write pointer violations. Are you seeing this error, or
> is it theoretical?
> 
> Javier

I meant per PU (parallel unit)…
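
For context, the mechanism referred to here — a write semaphore per
parallel unit (PU) — can be sketched roughly as follows. This is a
simplified illustration with made-up names, not the actual pblk code:

#include <linux/semaphore.h>

/*
 * One semaphore per parallel unit (PU), initialized to 1, so at most one
 * write per PU is in flight at any time and the chunk write pointer only
 * advances in order.
 */
struct pu_ctx {
	struct semaphore wr_sem;	/* sema_init(&wr_sem, 1) at init time */
};

static void pu_write_begin(struct pu_ctx *pu)
{
	down(&pu->wr_sem);	/* taken before issuing a write to this PU */
}

static void pu_write_end(struct pu_ctx *pu)
{
	up(&pu->wr_sem);	/* released from the write completion path */
}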
Marcin Dziegielewski March 18, 2019, 9:44 a.m. UTC | #3
On 3/14/19 3:22 PM, Javier González wrote:
> 
>> [...]
>>
>> We have a per-CPU semaphore that only allows sending a single I/O in
>> order to prevent write pointer violations. Are you seeing this error, or
>> is it theoretical?
>>
>> Javier
> 
> I meant per PU (parallel unit)…
> 

You are right, for the current upstream implementation it's theoretical.
I saw it after some experimental changes in the lightnvm layer.

But I see one more potential benefit from this patch (correct me if I'm
wrong): after this patch we can be sure that the write thread is running
on a CPU from the right NUMA node for the device. Now I think we could
also consider binding to the whole set of CPUs from the right NUMA node
instead of a particular one.
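
A minimal sketch of that alternative, for illustration only — the helper
name is made up, and it assumes the dev->node field added by this patch:

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/topology.h>

/* Let the writer run on any CPU of the device's NUMA node and leave the
 * exact CPU choice to the scheduler.
 */
static int pblk_writer_bind_to_node(struct task_struct *ts, int node)
{
	const struct cpumask *mask = cpumask_of_node(node);

	/* Fall back to all online CPUs if the node has no usable CPUs. */
	if (cpumask_empty(mask))
		mask = cpu_online_mask;

	return set_cpus_allowed_ptr(ts, mask);
}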

Marcin
Hans Holmberg March 18, 2019, 12:22 p.m. UTC | #4
On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
<marcin.dziegielewski@intel.com> wrote:
>
> [...]
>
> You are right, for the current upstream implementation it's theoretical.
> I saw it after some experimental changes in the lightnvm layer.
>
> But I see one more potential benefit from this patch (correct me if I'm
> wrong): after this patch we can be sure that the write thread is running
> on a CPU from the right NUMA node for the device. Now I think we could
> also consider binding to the whole set of CPUs from the right NUMA node
> instead of a particular one.

If this is just a potential benefit, I'd rather not do this, at least
not until we have numbers on any performance benefits and an analysis
of any trade offs.

Thanks,
Hans
>
> Marcin
Javier González March 18, 2019, 6:22 p.m. UTC | #5
> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@owltronix.com> wrote:
> 
> On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
> <marcin.dziegielewski@intel.com> wrote:
>> [...]
>>
>> You are right, for the current upstream implementation it's theoretical.
>> I saw it after some experimental changes in the lightnvm layer.
>>
>> But I see one more potential benefit from this patch (correct me if I'm
>> wrong): after this patch we can be sure that the write thread is running
>> on a CPU from the right NUMA node for the device. Now I think we could
>> also consider binding to the whole set of CPUs from the right NUMA node
>> instead of a particular one.

I can see what you mean, but I think we are better off relying on the
scheduler to balance the NUMA nodes rather than pinning the write thread
to specific CPUs. Are we doing something like this in the writeback
path?
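
One way to give the scheduler that NUMA hint without pinning, sketched
here as an assumption rather than a tested change, is to create the
writer with kthread_create_on_node(); this only affects where the
thread's task_struct and stack are allocated, not which CPUs it may run
on:

#include <linux/kthread.h>

/* kthread_create() is kthread_create_on_node(..., NUMA_NO_NODE, ...), so
 * this would be a drop-in change in pblk_writer_init(), reusing the
 * dev->node field this patch introduces.
 */
pblk->writer_ts = kthread_create_on_node(pblk_write_ts, pblk,
					 pblk->dev->parent->node,
					 "pblk-writer-t");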

> 
> If this is just a potential benefit, I'd rather not do this, at least
> not until we have numbers on any performance benefits and an analysis
> of any trade offs.
> 
> Thanks,
> Hans
>> Marcin
Marcin Dziegielewski March 19, 2019, 1:33 p.m. UTC | #6
On 3/18/19 7:22 PM, Javier González wrote:
> 
>> [...]
> 
> I can see what you mean, but I think we are better off relying on the
> scheduler to balance the NUMA nodes rather than pinning the write thread
> to specific CPUs. Are we doing something like this in the writeback
> path?
> 
I have checked the writeback path and I didn't find anything similar,
so please disregard this patch.

Marcin

>>
>> If this is just a potential benefit, I'd rather not do this, at least
>> not until we have numbers on any performance benefits and an analysis
>> of any trade offs.
>>
>> Thanks,
>> Hans
>>> Marcin
Javier González March 19, 2019, 2:25 p.m. UTC | #7
> On 19 Mar 2019, at 14.33, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
> 
> 
> 
> On 3/18/19 7:22 PM, Javier González wrote:
>>> [...]
>>
>> I can see what you mean, but I think we are better off relying on the
>> scheduler to balance the NUMA nodes rather than pinning the write thread
>> to specific CPUs. Are we doing something like this in the writeback
>> path?
> I have checked the writeback path and I didn't find anything similar, so please disregard this patch.
> 
> Marcin
> 

Don’t get me wrong. I think it is a good motivation, but maybe it should
be implemented at a different level.

Thanks,
Javier

Patch

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 81e8ed4..bd25004 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -21,6 +21,8 @@ 
 
 #include "pblk.h"
 #include "pblk-trace.h"
+#include <linux/cpumask.h>
+#include <linux/numa.h>
 
 static unsigned int write_buffer_size;
 
@@ -47,6 +49,8 @@  struct pblk_global_caches {
 
 struct bio_set pblk_bio_set;
 
+cpumask_t free_cpumask;
+
 static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
 			  struct bio *bio)
 {
@@ -1098,6 +1102,9 @@  static int pblk_lines_init(struct pblk *pblk)
 
 static int pblk_writer_init(struct pblk *pblk)
 {
+	cpumask_t tmp_cpumask, cpumask;
+	int cpu;
+
 	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
 	if (IS_ERR(pblk->writer_ts)) {
 		int err = PTR_ERR(pblk->writer_ts);
@@ -1108,6 +1115,22 @@  static int pblk_writer_init(struct pblk *pblk)
 		return err;
 	}
 
+	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
+			cpu_online_mask);
+	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
+
+	if (!cpumask_weight(&free_cpumask)) {
+		free_cpumask = CPU_MASK_ALL;
+		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
+	}
+
+	cpu = cpumask_last(&cpumask);
+
+	kthread_bind(pblk->writer_ts, cpu);
+
+	cpumask_clear_cpu(cpu, &free_cpumask);
+	pblk->writer_cpu = cpu;
+
 	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
 
@@ -1126,8 +1149,11 @@  static void pblk_writer_stop(struct pblk *pblk)
 			"Stopping not fully synced write buffer\n");
 
 	del_timer_sync(&pblk->wtimer);
-	if (pblk->writer_ts)
+	if (pblk->writer_ts) {
+		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
 		kthread_stop(pblk->writer_ts);
+		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
+	}
 }
 
 static void pblk_free(struct pblk *pblk)
@@ -1328,6 +1354,8 @@  static int __init pblk_module_init(void)
 {
 	int ret;
 
+	free_cpumask = CPU_MASK_ALL;
+
 	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
 	if (ret)
 		return ret;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 381f074..650f983 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -690,6 +690,7 @@  struct pblk {
 	atomic_t inflight_io;		/* General inflight I/O counter */
 
 	struct task_struct *writer_ts;
+	int writer_cpu;
 
 	/* Simple translation map of logical addresses to physical addresses.
 	 * The logical addresses is known by the host system, while the physical
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 949e29e..971a19f 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -982,6 +982,7 @@  int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
 	dev->ops = &nvme_nvm_dev_ops;
 	dev->private_data = ns;
+	dev->node = node;
 	ns->ndev = dev;
 
 	return nvm_register(dev);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 5d865a5..312029e 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -427,6 +427,8 @@  struct nvm_dev {
 	char name[DISK_NAME_LEN];
 	void *private_data;
 
+	int node;
+
 	void *rmap;
 
 	struct mutex mlock;