Message ID | 1552573079-12712-1-git-send-email-marcin.dziegielewski@intel.com (mailing list archive)
---|---
State | New, archived
Series | lightnvm: pblk: set write thread affinity to particular cpu
> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>
> In some cases, write thread migration between CPUs can cause writes to
> be sent in an improper order and, in consequence, a lot of errors from
> the device.
>
> Pinning the write thread to a particular CPU prevents this.
>
> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
>
> [full diff snipped; see the patch at the bottom of this page]

We have a per-CPU semaphore that only allows sending a single I/O in
order to prevent write pointer violations. Are you seeing this error, or
is it theoretical?

Javier
> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
>
>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>>
>> [full patch quote snipped]
>
> We have a per-CPU semaphore that only allows sending a single I/O in
> order to prevent write pointer violations. Are you seeing this error,
> or is it theoretical?
>
> Javier

I meant per PU (parallel unit)…
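For context, the ordering guarantee Javier refers to can be sketched as
follows. This is a minimal illustration, assuming a hypothetical per-PU
context; the names pu_ctx, wr_sem, issue_write and submit_write_to_pu are
illustrative and not actual pblk identifiers:

/*
 * Sketch of per-PU write serialization: a semaphore initialized to 1
 * admits a single in-flight write per parallel unit, so writes cannot
 * arrive at the device's write pointer out of order even if the
 * submitting threads race. All names are hypothetical.
 */
#include <linux/semaphore.h>

struct bio;	/* opaque here; real code includes <linux/bio.h> */

struct pu_ctx {
	struct semaphore wr_sem;	/* one outstanding write per PU */
};

static void pu_ctx_init(struct pu_ctx *pu)
{
	sema_init(&pu->wr_sem, 1);
}

/* hypothetical low-level submission helper */
static int issue_write(struct pu_ctx *pu, struct bio *bio);

static int submit_write_to_pu(struct pu_ctx *pu, struct bio *bio)
{
	down(&pu->wr_sem);	/* block until the previous write completes */
	return issue_write(pu, bio);
	/* the completion handler calls up(&pu->wr_sem) */
}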
On 3/14/19 3:22 PM, Javier González wrote:
>
>> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
>>
>> [full patch quote snipped]
>>
>> We have a per-CPU semaphore that only allows sending a single I/O in
>> order to prevent write pointer violations. Are you seeing this error,
>> or is it theoretical?
>>
>> Javier
>
> I meant per PU (parallel unit)…
>

You are right; for the current upstream implementation it is
theoretical. I saw it after some experimental changes in the lightnvm
layer.

But I see one more potential benefit from this patch (correct me if I'm
wrong): after this patch we can be sure that the write thread is running
on a CPU from the right NUMA node for the device. Now I think we could
also consider binding to the whole set of CPUs from the right NUMA node
instead of a particular one.

Marcin
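For reference, binding to the whole CPU set of the device's NUMA node,
as Marcin suggests, could look roughly like the sketch below.
set_cpus_allowed_ptr(), cpumask_of_node() and the cpumask helpers are
existing kernel APIs; the wrapper function itself is hypothetical:

/*
 * Sketch: allow a kthread to run on any online CPU of a given NUMA
 * node, rather than kthread_bind()-ing it to a single CPU. The
 * scheduler is still free to balance it within the node.
 */
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int bind_writer_to_node(struct task_struct *ts, int node)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_and(mask, cpumask_of_node(node), cpu_online_mask);
	if (cpumask_empty(mask))
		cpumask_copy(mask, cpu_online_mask);	/* fall back to all CPUs */

	ret = set_cpus_allowed_ptr(ts, mask);	/* whole-node affinity */
	free_cpumask_var(mask);
	return ret;
}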
On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
<marcin.dziegielewski@intel.com> wrote:
>
> On 3/14/19 3:22 PM, Javier González wrote:
> >
> > [full patch quote and earlier discussion snipped]
> >
> > I meant per PU (parallel unit)…
>
> You are right; for the current upstream implementation it is
> theoretical. I saw it after some experimental changes in the lightnvm
> layer.
>
> But I see one more potential benefit from this patch (correct me if I'm
> wrong): after this patch we can be sure that the write thread is running
> on a CPU from the right NUMA node for the device. Now I think we could
> also consider binding to the whole set of CPUs from the right NUMA node
> instead of a particular one.

If this is just a potential benefit, I'd rather not do this, at least
not until we have numbers on any performance benefits and an analysis
of any trade-offs.

Thanks,
Hans

> Marcin
> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@owltronix.com> wrote:
>
> On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
> <marcin.dziegielewski@intel.com> wrote:
>>
>> [full patch quote and earlier discussion snipped]
>>
>> You are right; for the current upstream implementation it is
>> theoretical. I saw it after some experimental changes in the lightnvm
>> layer.
>>
>> But I see one more potential benefit from this patch (correct me if I'm
>> wrong): after this patch we can be sure that the write thread is running
>> on a CPU from the right NUMA node for the device. Now I think we could
>> also consider binding to the whole set of CPUs from the right NUMA node
>> instead of a particular one.

I can see what you mean, but I think we are better off relying on the
scheduler to balance the NUMA nodes rather than pinning the write thread
to specific CPUs. Are we doing something like this in the writeback
path?

> If this is just a potential benefit, I'd rather not do this, at least
> not until we have numbers on any performance benefits and an analysis
> of any trade-offs.
>
> Thanks,
> Hans
>> Marcin
On 3/18/19 7:22 PM, Javier González wrote:
>
>> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@owltronix.com> wrote:
>>
>> [full patch quote and earlier discussion snipped]
>
> I can see what you mean, but I think we are better off relying on the
> scheduler to balance the NUMA nodes rather than pinning the write
> thread to specific CPUs. Are we doing something like this in the
> writeback path?

I have checked the writeback path and didn't find anything similar, so
please disregard this patch.

Marcin

>> If this is just a potential benefit, I'd rather not do this, at least
>> not until we have numbers on any performance benefits and an analysis
>> of any trade-offs.
>>
>> Thanks,
>> Hans
>>> Marcin
> On 19 Mar 2019, at 14.33, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>
> On 3/18/19 7:22 PM, Javier González wrote:
>>
>> [full patch quote and earlier discussion snipped]
>>
>> I can see what you mean, but I think we are better off relying on the
>> scheduler to balance the NUMA nodes rather than pinning the write
>> thread to specific CPUs. Are we doing something like this in the
>> writeback path?
>
> I have checked the writeback path and didn't find anything similar, so
> please disregard this patch.
>
> Marcin

Don't get me wrong. I think it is a good motivation, but maybe it should
be implemented at a different level.

Thanks,
Javier
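One possible "different level", as a hedged sketch: create the writer
with kthread_create_on_node() so its task_struct and stack are allocated
on the device's NUMA node, while CPU placement is left entirely to the
scheduler. kthread_create_on_node() and wake_up_process() are existing
kernel APIs; the wrapper function below is illustrative only:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

int pblk_write_ts(void *data);	/* pblk's writer thread function */

static struct task_struct *start_writer_on_node(void *pblk, int node)
{
	struct task_struct *ts;

	/* allocate the kthread NUMA-locally; no CPU pinning involved */
	ts = kthread_create_on_node(pblk_write_ts, pblk, node,
				    "pblk-writer-t");
	if (!IS_ERR(ts))
		wake_up_process(ts);	/* kthread_create_* leaves it stopped */
	return ts;
}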
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 81e8ed4..bd25004 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -21,6 +21,8 @@

 #include "pblk.h"
 #include "pblk-trace.h"
+#include <linux/cpumask.h>
+#include <linux/numa.h>

 static unsigned int write_buffer_size;

@@ -47,6 +49,8 @@ struct pblk_global_caches {

 struct bio_set pblk_bio_set;

+cpumask_t free_cpumask;
+
 static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
 			  struct bio *bio)
 {
@@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)

 static int pblk_writer_init(struct pblk *pblk)
 {
+	cpumask_t tmp_cpumask, cpumask;
+	int cpu;
+
 	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
 	if (IS_ERR(pblk->writer_ts)) {
 		int err = PTR_ERR(pblk->writer_ts);
@@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
 		return err;
 	}

+	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
+		    cpu_online_mask);
+	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
+
+	if (!cpumask_weight(&free_cpumask)) {
+		free_cpumask = CPU_MASK_ALL;
+		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
+	}
+
+	cpu = cpumask_last(&cpumask);
+
+	kthread_bind(pblk->writer_ts, cpu);
+
+	cpumask_clear_cpu(cpu, &free_cpumask);
+	pblk->writer_cpu = cpu;
+
 	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));

@@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
 			"Stopping not fully synced write buffer\n");

 	del_timer_sync(&pblk->wtimer);
-	if (pblk->writer_ts)
+	if (pblk->writer_ts) {
+		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
 		kthread_stop(pblk->writer_ts);
+		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
+	}
 }

 static void pblk_free(struct pblk *pblk)
@@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
 {
 	int ret;

+	free_cpumask = CPU_MASK_ALL;
+
 	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
 	if (ret)
 		return ret;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 381f074..650f983 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -690,6 +690,7 @@ struct pblk {
 	atomic_t inflight_io;		/* General inflight I/O counter */

 	struct task_struct *writer_ts;
+	int writer_cpu;

 	/* Simple translation map of logical addresses to physical addresses.
 	 * The logical addresses is known by the host system, while the physical
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 949e29e..971a19f 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
 	dev->ops = &nvme_nvm_dev_ops;
 	dev->private_data = ns;
+	dev->node = node;
 	ns->ndev = dev;

 	return nvm_register(dev);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 5d865a5..312029e 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -427,6 +427,8 @@ struct nvm_dev {
 	char name[DISK_NAME_LEN];
 	void *private_data;

+	int node;
+
 	void *rmap;

 	struct mutex mlock;
In some cases, write thread migration between CPUs can cause writes to
be sent in an improper order and, in consequence, a lot of errors from
the device.

Pinning the write thread to a particular CPU prevents this.

Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
---
 drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
 drivers/lightnvm/pblk.h      |  1 +
 drivers/nvme/host/lightnvm.c |  1 +
 include/linux/lightnvm.h     |  2 ++
 4 files changed, 33 insertions(+), 1 deletion(-)