
[v2,5/5] libnvdimm: add DMA support for pmem blk-mq

Message ID 150169928551.59677.14690799553760064519.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State New, archived

Commit Message

Dave Jiang Aug. 2, 2017, 6:41 p.m. UTC
Add DMA support for pmem blk reads. This provides a significant CPU
utilization reduction on large memory reads while maintaining good
performance. DMAs are triggered based on a test against
bio_multiple_segments(), so small I/Os (4k or less) are still performed
by the CPU in order to reduce latency. By default the pmem driver will
use blk-mq with DMA.

Numbers below are measured against pmem simulated via DRAM using
memmap=NN!SS.  DMA engine used is the ioatdma on Intel Skylake Xeon
platform.  Keep in mind the performance for actual persistent memory
will differ.
Fio 2.21 was used.

64k: 1 task queuedepth=1
CPU Read:  7631 MB/s  99.7% CPU    DMA Read: 2415 MB/s  54% CPU
CPU Write: 3552 MB/s  100% CPU     DMA Write: 2173 MB/s  54% CPU

64k: 16 tasks queuedepth=16
CPU Read: 36800 MB/s  1593% CPU    DMA Read:  29100 MB/s  607% CPU
CPU Write: 20900 MB/s 1589% CPU    DMA Write: 23400 MB/s  585% CPU

2M: 1 task queuedepth=1
CPU Read:  6013 MB/s  99.3% CPU    DMA Read:  7986 MB/s  59.3% CPU
CPU Write: 3579 MB/s  100% CPU     DMA Write: 5211 MB/s  58.3% CPU

2M: 16 tasks queuedepth=16
CPU Read:  18100 MB/s 1588% CPU    DMA Read:  21300 MB/s 180.9% CPU
CPU Write: 14100 MB/s 1594% CPU    DMA Write: 20400 MB/s 446.9% CPU

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/nvdimm/pmem.c |  214 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 204 insertions(+), 10 deletions(-)

Comments

Sinan Kaya Aug. 2, 2017, 7:22 p.m. UTC | #1
On 8/2/2017 2:41 PM, Dave Jiang wrote:
>  	if (queue_mode == PMEM_Q_MQ) {
> +		chan = dma_find_channel(DMA_MEMCPY);
> +		if (!chan) {
> +			queue_mode = PMEM_Q_BIO;
> +			dev_warn(dev, "Forced back to PMEM_Q_BIO, no DMA\n");
> +		}

We can't expect all MEMCPY capable hardware to support this feature, right?

Do we need a new API / new function, or new capability?

> +	}
Dave Jiang Aug. 2, 2017, 8:52 p.m. UTC | #2
On 08/02/2017 12:22 PM, Sinan Kaya wrote:
> On 8/2/2017 2:41 PM, Dave Jiang wrote:
>>  	if (queue_mode == PMEM_Q_MQ) {
>> +		chan = dma_find_channel(DMA_MEMCPY);
>> +		if (!chan) {
>> +			queue_mode = PMEM_Q_BIO;
>> +			dev_warn(dev, "Forced back to PMEM_Q_BIO, no DMA\n");
>> +		}
> 
> We can't expect all MEMCPY capable hardware to support this feature, right?
> 
> Do we need a new API / new function, or new capability?

Hmmm...you are right. I wonder if we need something like DMA_SG cap....


> 
>> +	}
> 
>
Sinan Kaya Aug. 2, 2017, 9:10 p.m. UTC | #3
On 8/2/2017 4:52 PM, Dave Jiang wrote:
>> Do we need a new API / new function, or new capability?
> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> 
> 

Unfortunately, DMA_SG means something else. Maybe we need DMA_MEMCPY_SG,
to be similar to DMA_MEMSET_SG.

enum dma_transaction_type {
	DMA_MEMCPY,
	DMA_XOR,
	DMA_PQ,
	DMA_XOR_VAL,
	DMA_PQ_VAL,
	DMA_MEMSET,
	DMA_MEMSET_SG,
	DMA_INTERRUPT,
	DMA_SG,
	DMA_PRIVATE,
	DMA_ASYNC_TX,
	DMA_SLAVE,
	DMA_CYCLIC,
	DMA_INTERLEAVE,
/* last transaction type for creation of the capabilities mask */
	DMA_TX_TYPE_END,
};
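
Concretely, the addition would sit next to DMA_MEMSET_SG, something like
this (sketch only, not an actual patch against dmaengine.h):

 	DMA_MEMSET,
 	DMA_MEMSET_SG,
+	DMA_MEMCPY_SG,	/* copy between a flat buffer and a scatterlist */
 	DMA_INTERRUPT,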
Dave Jiang Aug. 2, 2017, 9:13 p.m. UTC | #4
On 08/02/2017 02:10 PM, Sinan Kaya wrote:
> On 8/2/2017 4:52 PM, Dave Jiang wrote:
>>> Do we need a new API / new function, or new capability?
>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>
>>
> 
> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> to be similar with DMA_MEMSET_SG.

I'm ok with that if Vinod is.

> 
> enum dma_transaction_type {
> 	DMA_MEMCPY,
> 	DMA_XOR,
> 	DMA_PQ,
> 	DMA_XOR_VAL,
> 	DMA_PQ_VAL,
> 	DMA_MEMSET,
> 	DMA_MEMSET_SG,
> 	DMA_INTERRUPT,
> 	DMA_SG,
> 	DMA_PRIVATE,
> 	DMA_ASYNC_TX,
> 	DMA_SLAVE,
> 	DMA_CYCLIC,
> 	DMA_INTERLEAVE,
> /* last transaction type for creation of the capabilities mask */
> 	DMA_TX_TYPE_END,
> };
>
Vinod Koul Aug. 3, 2017, 5:01 a.m. UTC | #5
On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
> 
> 
> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
> > On 8/2/2017 4:52 PM, Dave Jiang wrote:
> >>> Do we need a new API / new function, or new capability?
> >> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> >>
> >>
> > 
> > Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> > to be similar with DMA_MEMSET_SG.
> 
> I'm ok with that if Vinod is.

So what exactly is the ask here: are you trying to do MEMCPY, SG, or MEMSET,
or all of them? :) We should have done bitfields for this though...

> 
> > 
> > enum dma_transaction_type {
> > 	DMA_MEMCPY,
> > 	DMA_XOR,
> > 	DMA_PQ,
> > 	DMA_XOR_VAL,
> > 	DMA_PQ_VAL,
> > 	DMA_MEMSET,
> > 	DMA_MEMSET_SG,
> > 	DMA_INTERRUPT,
> > 	DMA_SG,
> > 	DMA_PRIVATE,
> > 	DMA_ASYNC_TX,
> > 	DMA_SLAVE,
> > 	DMA_CYCLIC,
> > 	DMA_INTERLEAVE,
> > /* last transaction type for creation of the capabilities mask */
> > 	DMA_TX_TYPE_END,
> > };
> >
Dave Jiang Aug. 3, 2017, 5:11 a.m. UTC | #6
> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> 
>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
>> 
>> 
>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
>>>>> Do we need a new API / new function, or new capability?
>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>> 
>>>> 
>>> 
>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>> to be similar with DMA_MEMSET_SG.
>> 
>> I'm ok with that if Vinod is.
> 
> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
> or all :). We should have done bitfields for this though...

Add DMA_MEMCPY_SG to transaction type. 

> 
>> 
>>> 
>>> enum dma_transaction_type {
>>>    DMA_MEMCPY,
>>>    DMA_XOR,
>>>    DMA_PQ,
>>>    DMA_XOR_VAL,
>>>    DMA_PQ_VAL,
>>>    DMA_MEMSET,
>>>    DMA_MEMSET_SG,
>>>    DMA_INTERRUPT,
>>>    DMA_SG,
>>>    DMA_PRIVATE,
>>>    DMA_ASYNC_TX,
>>>    DMA_SLAVE,
>>>    DMA_CYCLIC,
>>>    DMA_INTERLEAVE,
>>> /* last transaction type for creation of the capabilities mask */
>>>    DMA_TX_TYPE_END,
>>> };
>>> 
> 
> -- 
> ~Vinod
Vinod Koul Aug. 3, 2017, 5:28 a.m. UTC | #7
On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
> 
> 
> > On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> > 
> >> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
> >> 
> >> 
> >>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
> >>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
> >>>>> Do we need a new API / new function, or new capability?
> >>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> >>>> 
> >>>> 
> >>> 
> >>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> >>> to be similar with DMA_MEMSET_SG.
> >> 
> >> I'm ok with that if Vinod is.
> > 
> > So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
> > or all :). We should have done bitfields for this though...
> 
> Add DMA_MEMCPY_SG to transaction type. 

Not MEMSET, right? Then why not use DMA_SG? DMA_SG is meant for
scatterlist-to-scatterlist copy, and it is the capability checked for
device_prep_dma_sg() calls.
Dave Jiang Aug. 3, 2017, 5:36 a.m. UTC | #8
> On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> 
>> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
>> 
>> 
>>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>>>> 
>>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
>>>> 
>>>> 
>>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
>>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
>>>>>>> Do we need a new API / new function, or new capability?
>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>>>> 
>>>>>> 
>>>>> 
>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>>>> to be similar with DMA_MEMSET_SG.
>>>> 
>>>> I'm ok with that if Vinod is.
>>> 
>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>>> or all :). We should have done bitfields for this though...
>> 
>> Add DMA_MEMCPY_SG to transaction type. 
> 
> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
> scatterlist to scatterlist copy which is used to check for
> device_prep_dma_sg() calls
> 
Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So we need something separate from what DMA_SG is used for.
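
For reference, the existing DMA_SG capability maps to device_prep_dma_sg(),
which takes two scatterlists. What this patch needs is a buffer-to/from-
scatterlist variant; the second prototype below is inferred from the call
site in patch 5/5 and does not exist upstream yet:

	/* existing, in struct dma_device: sg-to-sg copy */
	struct dma_async_tx_descriptor *(*device_prep_dma_sg)(
			struct dma_chan *chan,
			struct scatterlist *dst_sg, unsigned int dst_nents,
			struct scatterlist *src_sg, unsigned int src_nents,
			unsigned long flags);

	/* proposed: copy between a flat buffer and a scatterlist */
	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)(
			struct dma_chan *chan,
			struct scatterlist *sg, unsigned int sg_nents,
			dma_addr_t buf, bool to_sg,
			unsigned long flags);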


> -- 
> ~Vinod
Vinod Koul Aug. 3, 2017, 8:59 a.m. UTC | #9
On Thu, Aug 03, 2017 at 11:06:13AM +0530, Jiang, Dave wrote:
> 
> 
> > On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> > 
> >> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
> >> 
> >> 
> >>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> >>>> 
> >>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
> >>>> 
> >>>> 
> >>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
> >>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
> >>>>>>> Do we need a new API / new function, or new capability?
> >>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> >>>>>> 
> >>>>>> 
> >>>>> 
> >>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> >>>>> to be similar with DMA_MEMSET_SG.
> >>>> 
> >>>> I'm ok with that if Vinod is.
> >>> 
> >>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
> >>> or all :). We should have done bitfields for this though...
> >> 
> >> Add DMA_MEMCPY_SG to transaction type. 
> > 
> > Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
> > scatterlist to scatterlist copy which is used to check for
> > device_prep_dma_sg() calls
> > 
> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
> we need something separate than what DMA_SG is used for. 

Hmm, it's SG-to-buffer and it's memcpy, so should we call it DMA_SG_BUFFER?
Since it is not memset (or is it?) I would not call it memset. Or maybe we
should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
Dave Jiang Aug. 3, 2017, 2:36 p.m. UTC | #10
> On Aug 3, 2017, at 1:56 AM, Koul, Vinod <vinod.koul@intel.com> wrote:
> 
>> On Thu, Aug 03, 2017 at 11:06:13AM +0530, Jiang, Dave wrote:
>> 
>> 
>>>> On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>>>> 
>>>> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
>>>> 
>>>> 
>>>>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>>>>>> 
>>>>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
>>>>>> 
>>>>>> 
>>>>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
>>>>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
>>>>>>>>> Do we need a new API / new function, or new capability?
>>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>>>>>> 
>>>>>>>> 
>>>>>>> 
>>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>>>>>> to be similar with DMA_MEMSET_SG.
>>>>>> 
>>>>>> I'm ok with that if Vinod is.
>>>>> 
>>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>>>>> or all :). We should have done bitfields for this though...
>>>> 
>>>> Add DMA_MEMCPY_SG to transaction type. 
>>> 
>>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>>> scatterlist to scatterlist copy which is used to check for
>>> device_prep_dma_sg() calls
>>> 
>> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>> we need something separate than what DMA_SG is used for. 
> 
> Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
> since it is not memset (or is it) I would not call it memset, or maybe we
> should also change DMA_SG to DMA_SG_SG to make it terribly clear :D

I can create patches for both. 

> 
> -- 
> ~Vinod
Vinod Koul Aug. 3, 2017, 3:55 p.m. UTC | #11
On Thu, Aug 03, 2017 at 08:06:07PM +0530, Jiang, Dave wrote:
> 
> 
> > On Aug 3, 2017, at 1:56 AM, Koul, Vinod <vinod.koul@intel.com> wrote:
> > 
> >> On Thu, Aug 03, 2017 at 11:06:13AM +0530, Jiang, Dave wrote:
> >> 
> >> 
> >>>> On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> >>>> 
> >>>> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
> >>>> 
> >>>> 
> >>>>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> >>>>>> 
> >>>>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
> >>>>>> 
> >>>>>> 
> >>>>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
> >>>>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
> >>>>>>>>> Do we need a new API / new function, or new capability?
> >>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> >>>>>>>> 
> >>>>>>>> 
> >>>>>>> 
> >>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> >>>>>>> to be similar with DMA_MEMSET_SG.
> >>>>>> 
> >>>>>> I'm ok with that if Vinod is.
> >>>>> 
> >>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
> >>>>> or all :). We should have done bitfields for this though...
> >>>> 
> >>>> Add DMA_MEMCPY_SG to transaction type. 
> >>> 
> >>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
> >>> scatterlist to scatterlist copy which is used to check for
> >>> device_prep_dma_sg() calls
> >>> 
> >> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
> >> we need something separate than what DMA_SG is used for. 
> > 
> > Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
> > since it is not memset (or is it) I would not call it memset, or maybe we
> > should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
> 
> I can create patches for both. 

Great. Anyone who disagrees, or who can give better names? :)
Dan Williams Aug. 3, 2017, 4:14 p.m. UTC | #12
On Thu, Aug 3, 2017 at 8:55 AM, Vinod Koul <vinod.koul@intel.com> wrote:
> On Thu, Aug 03, 2017 at 08:06:07PM +0530, Jiang, Dave wrote:
>>
>>
>> > On Aug 3, 2017, at 1:56 AM, Koul, Vinod <vinod.koul@intel.com> wrote:
>> >
>> >> On Thu, Aug 03, 2017 at 11:06:13AM +0530, Jiang, Dave wrote:
>> >>
>> >>
>> >>>> On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>> >>>>
>> >>>> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
>> >>>>
>> >>>>
>> >>>>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>> >>>>>>
>> >>>>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
>> >>>>>>
>> >>>>>>
>> >>>>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
>> >>>>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
>> >>>>>>>>> Do we need a new API / new function, or new capability?
>> >>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>> >>>>>>>>
>> >>>>>>>>
>> >>>>>>>
>> >>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>> >>>>>>> to be similar with DMA_MEMSET_SG.
>> >>>>>>
>> >>>>>> I'm ok with that if Vinod is.
>> >>>>>
>> >>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>> >>>>> or all :). We should have done bitfields for this though...
>> >>>>
>> >>>> Add DMA_MEMCPY_SG to transaction type.
>> >>>
>> >>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>> >>> scatterlist to scatterlist copy which is used to check for
>> >>> device_prep_dma_sg() calls
>> >>>
>> >> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>> >> we need something separate than what DMA_SG is used for.
>> >
>> > Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
>> > since it is not memset (or is it) I would not call it memset, or maybe we
>> > should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
>>
>> I can create patches for both.
>
> Great, anyone who disagrees or can give better names :)
>

All my suggestions would involve a lot more work. If we had infinite
time we'd stop with the per-operation-type entry points and make this
look like a typical driver sub-system that takes commands like
block-devices or usb, but perhaps that ship has sailed.
Dave Jiang Aug. 3, 2017, 5:07 p.m. UTC | #13
On 08/03/2017 09:14 AM, Dan Williams wrote:
> On Thu, Aug 3, 2017 at 8:55 AM, Vinod Koul <vinod.koul@intel.com> wrote:
>> On Thu, Aug 03, 2017 at 08:06:07PM +0530, Jiang, Dave wrote:
>>>
>>>
>>>> On Aug 3, 2017, at 1:56 AM, Koul, Vinod <vinod.koul@intel.com> wrote:
>>>>
>>>>> On Thu, Aug 03, 2017 at 11:06:13AM +0530, Jiang, Dave wrote:
>>>>>
>>>>>
>>>>>>> On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>>>>>>>
>>>>>>> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
>>>>>>>
>>>>>>>
>>>>>>>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
>>>>>>>>>
>>>>>>>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
>>>>>>>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
>>>>>>>>>>>> Do we need a new API / new function, or new capability?
>>>>>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>>>>>>>>> to be similar with DMA_MEMSET_SG.
>>>>>>>>>
>>>>>>>>> I'm ok with that if Vinod is.
>>>>>>>>
>>>>>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>>>>>>>> or all :). We should have done bitfields for this though...
>>>>>>>
>>>>>>> Add DMA_MEMCPY_SG to transaction type.
>>>>>>
>>>>>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>>>>>> scatterlist to scatterlist copy which is used to check for
>>>>>> device_prep_dma_sg() calls
>>>>>>
>>>>> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>>>>> we need something separate than what DMA_SG is used for.
>>>>
>>>> Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
>>>> since it is not memset (or is it) I would not call it memset, or maybe we
>>>> should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
>>>
>>> I can create patches for both.
>>
>> Great, anyone who disagrees or can give better names :)
>>
> 
> All my suggestions would involve a lot more work. If we had infinite
> time we'd stop with the per-operation-type entry points and make this
> look like a typical driver sub-system that takes commands like
> block-devices or usb, but perhaps that ship has sailed.
> 

Allen, isn't this what we were just talking about on IRC yesterday?

<allenbh> I dislike prep_tx grabbing a device-specific descriptor.  The
device really only needs as many device-specific descriptors as will keep
the hw busy; just let the logical layer queue up logical descriptors
until hw descriptors become available.  Let the client allocate
descriptors to submit, and the hw driver can translate to a
hardware-specific descriptor at the last moment before notifying the hw.
Allen Hubbe Aug. 3, 2017, 6:35 p.m. UTC | #14
From: Dave Jiang
> On 08/03/2017 09:14 AM, Dan Williams wrote:
> > On Thu, Aug 3, 2017 at 8:55 AM, Vinod Koul <vinod.koul@intel.com> wrote:
> >> On Thu, Aug 03, 2017 at 08:06:07PM +0530, Jiang, Dave wrote:
> >>>> On Aug 3, 2017, at 1:56 AM, Koul, Vinod <vinod.koul@intel.com> wrote:
> >>>>> On Thu, Aug 03, 2017 at 11:06:13AM +0530, Jiang, Dave wrote:
> >>>>>>> On Aug 2, 2017, at 10:25 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> >>>>>>> On Thu, Aug 03, 2017 at 10:41:51AM +0530, Jiang, Dave wrote:
> >>>>>>>>> On Aug 2, 2017, at 9:58 PM, Koul, Vinod <vinod.koul@intel.com> wrote:
> >>>>>>>>> On Wed, Aug 02, 2017 at 02:13:56PM -0700, Dave Jiang wrote:
> >>>>>>>>>> On 08/02/2017 02:10 PM, Sinan Kaya wrote:
> >>>>>>>>>> On 8/2/2017 4:52 PM, Dave Jiang wrote:
> >>>>>>>>>>>> Do we need a new API / new function, or new capability?
> >>>>>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> >>>>>>>>>>
> >>>>>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> >>>>>>>>>> to be similar with DMA_MEMSET_SG.
> >>>>>>>>>
> >>>>>>>>> I'm ok with that if Vinod is.
> >>>>>>>>
> >>>>>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
> >>>>>>>> or all :). We should have done bitfields for this though...
> >>>>>>>
> >>>>>>> Add DMA_MEMCPY_SG to transaction type.
> >>>>>>
> >>>>>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
> >>>>>> scatterlist to scatterlist copy which is used to check for
> >>>>>> device_prep_dma_sg() calls
> >>>>>>
> >>>>> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
> >>>>> we need something separate than what DMA_SG is used for.
> >>>>
> >>>> Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
> >>>> since it is not memset (or is it) I would not call it memset, or maybe we
> >>>> should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
> >>>
> >>> I can create patches for both.
> >>
> >> Great, anyone who disagrees or can give better names :)
> >
> > All my suggestions would involve a lot more work. If we had infinite
> > time we'd stop with the per-operation-type entry points and make this
> > look like a typical driver sub-system that takes commands like
> > block-devices or usb, but perhaps that ship has sailed.
> 
> Allen, isn't this what we were just talking about on IRC yesterday?
> 
> <allenbh> I dislike prep_tx grabbing a device-specific descriptor.  The
> device really only needs as many device-specific descriptors as will keep
> the hw busy; just let the logical layer queue up logical descriptors
> until hw descriptors become available.  Let the client allocate
> descriptors to submit, and the hw driver can translate to a
> hardware-specific descriptor at the last moment before notifying the hw.

Yeah, that last part, "like a typical driver sub-system that takes commands" +1!

Especially if the client can submit a list of commands.  I have an rdma driver using the dmaengine api, but it would be more natural to build up a list of dma operations, just using my driver's own memory and the struct definition of some abstract dma descriptor, not calling any functions of the dmaengine api.  After building the list of descriptors, submit the entire list all at once to the hardware driver, or not at all.  Instead, in perf I see a lot of heat on that spinlock for each individual dma prep in ioat (sorry for picking on you, Dave, this is the only dma engine hw and driver that I am familiar with).
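
Something like this, roughly (every name below is invented for illustration;
none of this is an existing api):

	/* abstract descriptor, allocated and filled in by the client */
	struct dma_op {
		dma_addr_t src;
		dma_addr_t dst;
		size_t len;
	};

	/*
	 * Hand the whole list to the hw driver in one call: one lock
	 * acquisition and one doorbell for the batch, and the driver only
	 * translates ops into hw descriptors as hw slots free up.
	 */
	int dma_submit_ops(struct dma_chan *chan, struct dma_op *ops,
			unsigned int nops,
			void (*done)(void *ctx, int status), void *ctx);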

Also, I think we could glean some more efficiency if the dma completion path took a hint from napi_schedule() and napi->poll(), instead of scheduling a tasklet directly in the dmaengine hw driver.  For one thing, it could reduce the number of interrupts.  Also, coordinating the servicing of dma completions with the posting of new work, with the schedule determined from the top of the stack down, could further reduce contention on that lock between dma prep and cleanup/complete.

http://elixir.free-electrons.com/linux/v4.12.4/source/drivers/dma/ioat/dma.c#L448
http://elixir.free-electrons.com/linux/v4.12.4/source/drivers/dma/ioat/dma.c#L652

Also, I apologize for not offering to do all this work.  If I were ready to jump in and do it, I would have spoken up earlier.
Ross Zwisler Aug. 3, 2017, 8:20 p.m. UTC | #15
On Wed, Aug 02, 2017 at 11:41:25AM -0700, Dave Jiang wrote:
> Add DMA support for pmem blk reads. This provides a significant CPU
> utilization reduction on large memory reads while maintaining good
> performance. DMAs are triggered based on a test against
> bio_multiple_segments(), so small I/Os (4k or less) are still performed
> by the CPU in order to reduce latency. By default the pmem driver will
> use blk-mq with DMA.
> 
> Numbers below are measured against pmem simulated via DRAM using
> memmap=NN!SS.  DMA engine used is the ioatdma on Intel Skylake Xeon
> platform.  Keep in mind the performance for actual persistent memory
> will differ.
> Fio 2.21 was used.
> 
> 64k: 1 task queuedepth=1
> CPU Read:  7631 MB/s  99.7% CPU    DMA Read: 2415 MB/s  54% CPU
> CPU Write: 3552 MB/s  100% CPU     DMA Write: 2173 MB/s  54% CPU
> 
> 64k: 16 tasks queuedepth=16
> CPU Read: 36800 MB/s  1593% CPU    DMA Read:  29100 MB/s  607% CPU
> CPU Write: 20900 MB/s 1589% CPU    DMA Write: 23400 MB/s  585% CPU
> 
> 2M: 1 task queuedepth=1
> CPU Read:  6013 MB/s  99.3% CPU    DMA Read:  7986 MB/s  59.3% CPU
> CPU Write: 3579 MB/s  100% CPU     DMA Write: 5211 MB/s  58.3% CPU
> 
> 2M: 16 tasks queuedepth=16
> CPU Read:  18100 MB/s 1588% CPU    DMA Read:  21300 MB/s 180.9% CPU
> CPU Write: 14100 MB/s 1594% CPU    DMA Write: 20400 MB/s 446.9% CPU
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>

<>

> +static int pmem_handle_cmd_dma(struct pmem_cmd *cmd, bool is_write)
> +{
...
> +err_set_unmap:
> +	dmaengine_unmap_put(unmap);
> +err_unmap_buffer:
> +	dma_unmap_page(dev, dma_addr, len, dir);
> +err_unmap_sg:
> +	if (dir == DMA_TO_DEVICE)
> +		dir = DMA_FROM_DEVICE;
> +	else
> +		dir = DMA_TO_DEVICE;
> +	dma_unmap_sg(dev, cmd->sg, cmd->sg_nents, dir);
> +	dmaengine_unmap_put(unmap);
> +err:
> +	blk_mq_end_request(cmd->rq, -ENXIO);

Should this be:

	blk_mq_end_request(cmd->rq, rc);

?

Otherwise:

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

> +	return rc;
> +}
> +
Vinod Koul Aug. 16, 2017, 4:50 p.m. UTC | #16
On Thu, Aug 03, 2017 at 09:14:13AM -0700, Dan Williams wrote:
> >> >>>>>>>>> Do we need a new API / new function, or new capability?
> >> >>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
> >> >>>>>>>>
> >> >>>>>>>>
> >> >>>>>>>
> >> >>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
> >> >>>>>>> to be similar with DMA_MEMSET_SG.
> >> >>>>>>
> >> >>>>>> I'm ok with that if Vinod is.
> >> >>>>>
> >> >>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
> >> >>>>> or all :). We should have done bitfields for this though...
> >> >>>>
> >> >>>> Add DMA_MEMCPY_SG to transaction type.
> >> >>>
> >> >>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
> >> >>> scatterlist to scatterlist copy which is used to check for
> >> >>> device_prep_dma_sg() calls
> >> >>>
> >> >> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
> >> >> we need something separate than what DMA_SG is used for.
> >> >
> >> > Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
> >> > since it is not memset (or is it) I would not call it memset, or maybe we
> >> > should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
> >>
> >> I can create patches for both.
> >
> > Great, anyone who disagrees or can give better names :)
> 
> All my suggestions would involve a lot more work. If we had infinite
> time we'd stop with the per-operation-type entry points and make this
> look like a typical driver sub-system that takes commands like
> block-devices or usb, but perhaps that ship has sailed.

Can you elaborate on this :)

I have been thinking about the need to redo the API. So let's discuss :)
Dan Williams Aug. 16, 2017, 5:06 p.m. UTC | #17
On Wed, Aug 16, 2017 at 9:50 AM, Vinod Koul <vinod.koul@intel.com> wrote:
> On Thu, Aug 03, 2017 at 09:14:13AM -0700, Dan Williams wrote:
>> >> >>>>>>>>> Do we need a new API / new function, or new capability?
>> >> >>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>> >> >>>>>>>>
>> >> >>>>>>>>
>> >> >>>>>>>
>> >> >>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>> >> >>>>>>> to be similar with DMA_MEMSET_SG.
>> >> >>>>>>
>> >> >>>>>> I'm ok with that if Vinod is.
>> >> >>>>>
>> >> >>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>> >> >>>>> or all :). We should have done bitfields for this though...
>> >> >>>>
>> >> >>>> Add DMA_MEMCPY_SG to transaction type.
>> >> >>>
>> >> >>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>> >> >>> scatterlist to scatterlist copy which is used to check for
>> >> >>> device_prep_dma_sg() calls
>> >> >>>
>> >> >> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>> >> >> we need something separate than what DMA_SG is used for.
>> >> >
>> >> > Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
>> >> > since it is not memset (or is it) I would not call it memset, or maybe we
>> >> > should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
>> >>
>> >> I can create patches for both.
>> >
>> > Great, anyone who disagrees or can give better names :)
>>
>> All my suggestions would involve a lot more work. If we had infinite
>> time we'd stop with the per-operation-type entry points and make this
>> look like a typical driver sub-system that takes commands like
>> block-devices or usb, but perhaps that ship has sailed.
>
> Can you elaborate on this :)
>
> I have been thinking about the need to redo the API. So lets discuss :)

The high level is straightforward; the devil is in the details. Define
a generic dma command object, perhaps 'struct dma_io' (certainly not
'struct dma_async_tx_descriptor'), and have just one entry point
per driver. That 'struct dma_io' would carry a generic command number,
a target address, and a scatterlist. The driver entry point would then
convert the command into the hardware command format and build it onto
a submission queue. The basic driving design principle is to convert all
the current function-pointer complexity of the prep_* routines into
data-structure complexity in the common command format.

This trades off some efficiency, because now you need to write both the
generic command and the descriptor, but I think if the operation is
worth offloading those conversion costs must already be in the noise.
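
As a strawman (every name below is hypothetical):

	struct dma_io {
		unsigned int cmd;		/* generic command number */
		dma_addr_t addr;		/* flat target address */
		struct scatterlist *sg;		/* data moved to/from addr */
		unsigned int nents;
		void (*complete)(struct dma_io *io, int status);
	};

	/*
	 * The single per-driver entry point: translate io->cmd into the
	 * hardware command format and place it on a submission queue.
	 */
	int (*submit)(struct dma_chan *chan, struct dma_io *io);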
Dave Jiang Aug. 16, 2017, 5:16 p.m. UTC | #18
On 08/16/2017 10:06 AM, Dan Williams wrote:
> On Wed, Aug 16, 2017 at 9:50 AM, Vinod Koul <vinod.koul@intel.com> wrote:
>> On Thu, Aug 03, 2017 at 09:14:13AM -0700, Dan Williams wrote:
>>>>>>>>>>>>>> Do we need a new API / new function, or new capability?
>>>>>>>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>>>>>>>>>>> to be similar with DMA_MEMSET_SG.
>>>>>>>>>>>
>>>>>>>>>>> I'm ok with that if Vinod is.
>>>>>>>>>>
>>>>>>>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>>>>>>>>>> or all :). We should have done bitfields for this though...
>>>>>>>>>
>>>>>>>>> Add DMA_MEMCPY_SG to transaction type.
>>>>>>>>
>>>>>>>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>>>>>>>> scatterlist to scatterlist copy which is used to check for
>>>>>>>> device_prep_dma_sg() calls
>>>>>>>>
>>>>>>> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>>>>>>> we need something separate than what DMA_SG is used for.
>>>>>>
>>>>>> Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
>>>>>> since it is not memset (or is it) I would not call it memset, or maybe we
>>>>>> should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
>>>>>
>>>>> I can create patches for both.
>>>>
>>>> Great, anyone who disagrees or can give better names :)
>>>
>>> All my suggestions would involve a lot more work. If we had infinite
>>> time we'd stop with the per-operation-type entry points and make this
>>> look like a typical driver sub-system that takes commands like
>>> block-devices or usb, but perhaps that ship has sailed.
>>
>> Can you elaborate on this :)
>>
>> I have been thinking about the need to redo the API. So lets discuss :)
> 
> The high level is straightforward, the devil is in the details. Define
> a generic dma command object, perhaps 'struct dma_io' certainly not
> 'struct dma_async_tx_descriptor', and have just one entry point
> per-driver. That 'struct dma_io' would carry a generic command number
> a target address and a scatterlist. The driver entry point would then
> convert and build the command to the hardware command format plus
> submission queue. The basic driving design principle is convert all
> the current function pointer complexity with the prep_* routines into
> data structure complexity in the common command format.
> 
> This trades off some efficiency because now you need to write the
> generic command and write the descriptor, but I think if the operation
> is worth offloading those conversion costs must already be in the
> noise.

Vinod, if you want to look at existing examples, take a look at the
block layer request queue, or even better blk-mq. I think this is
pretty close to what Dan is envisioning. Also, it's probably time we
looked into supporting hotplug for DMA engines. Maybe this will
make it easier to do so. I'm willing to help, and I'm hoping that it
will make things easier for me for the next gen hardware.
Dan Williams Aug. 16, 2017, 5:20 p.m. UTC | #19
On Wed, Aug 16, 2017 at 10:16 AM, Dave Jiang <dave.jiang@intel.com> wrote:
>
>
> On 08/16/2017 10:06 AM, Dan Williams wrote:
>> On Wed, Aug 16, 2017 at 9:50 AM, Vinod Koul <vinod.koul@intel.com> wrote:
>>> On Thu, Aug 03, 2017 at 09:14:13AM -0700, Dan Williams wrote:
>>>>>>>>>>>>>>> Do we need a new API / new function, or new capability?
>>>>>>>>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>>>>>>>>>>>> to be similar with DMA_MEMSET_SG.
>>>>>>>>>>>>
>>>>>>>>>>>> I'm ok with that if Vinod is.
>>>>>>>>>>>
>>>>>>>>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>>>>>>>>>>> or all :). We should have done bitfields for this though...
>>>>>>>>>>
>>>>>>>>>> Add DMA_MEMCPY_SG to transaction type.
>>>>>>>>>
>>>>>>>>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>>>>>>>>> scatterlist to scatterlist copy which is used to check for
>>>>>>>>> device_prep_dma_sg() calls
>>>>>>>>>
>>>>>>>> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>>>>>>>> we need something separate than what DMA_SG is used for.
>>>>>>>
>>>>>>> Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
>>>>>>> since it is not memset (or is it) I would not call it memset, or maybe we
>>>>>>> should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
>>>>>>
>>>>>> I can create patches for both.
>>>>>
>>>>> Great, anyone who disagrees or can give better names :)
>>>>
>>>> All my suggestions would involve a lot more work. If we had infinite
>>>> time we'd stop with the per-operation-type entry points and make this
>>>> look like a typical driver sub-system that takes commands like
>>>> block-devices or usb, but perhaps that ship has sailed.
>>>
>>> Can you elaborate on this :)
>>>
>>> I have been thinking about the need to redo the API. So lets discuss :)
>>
>> The high level is straightforward, the devil is in the details. Define
>> a generic dma command object, perhaps 'struct dma_io' certainly not
>> 'struct dma_async_tx_descriptor', and have just one entry point
>> per-driver. That 'struct dma_io' would carry a generic command number
>> a target address and a scatterlist. The driver entry point would then
>> convert and build the command to the hardware command format plus
>> submission queue. The basic driving design principle is convert all
>> the current function pointer complexity with the prep_* routines into
>> data structure complexity in the common command format.
>>
>> This trades off some efficiency because now you need to write the
>> generic command and write the descriptor, but I think if the operation
>> is worth offloading those conversion costs must already be in the
>> noise.
>
> Vinod, I think if you want to look at existing examples take a look at
> the block layer request queue. Or even better blk-mq. I think this is
> pretty close to what Dan is envisioning? Also, it's probably time we
> looking into supporting hotplugging for DMA engines? Maybe this will
> make it easier to do so. I'm willing to help and hoping that it will
> make things easier for me for the next gen hardware.

Yes, device hotplug is a good one to add to the list. We didn't have
'struct percpu_ref' when dmaengine started; that would make hotplug
support easier to handle without coarse locking.
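
Roughly, per channel (struct and field names below are invented; the
percpu_ref calls are the real api):

	struct dma_chan_hp {			/* hypothetical */
		struct percpu_ref ref;
		struct completion ref_done;
	};

	static void chan_ref_release(struct percpu_ref *ref)
	{
		struct dma_chan_hp *chan =
			container_of(ref, struct dma_chan_hp, ref);

		complete(&chan->ref_done);	/* last submitter dropped out */
	}

	/* submit path: a cheap per-cpu get instead of a coarse lock */
	if (!percpu_ref_tryget_live(&chan->ref))
		return -ENODEV;			/* channel is being unplugged */
	/* ...queue the descriptor... */
	percpu_ref_put(&chan->ref);

	/* hot-unplug path: stop new submitters, wait for the ref to drain */
	percpu_ref_kill(&chan->ref);
	wait_for_completion(&chan->ref_done);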
Dave Jiang Aug. 16, 2017, 5:27 p.m. UTC | #20
On 08/16/2017 10:20 AM, Dan Williams wrote:
> On Wed, Aug 16, 2017 at 10:16 AM, Dave Jiang <dave.jiang@intel.com> wrote:
>>
>>
>> On 08/16/2017 10:06 AM, Dan Williams wrote:
>>> On Wed, Aug 16, 2017 at 9:50 AM, Vinod Koul <vinod.koul@intel.com> wrote:
>>>> On Thu, Aug 03, 2017 at 09:14:13AM -0700, Dan Williams wrote:
>>>>>>>>>>>>>>>> Do we need a new API / new function, or new capability?
>>>>>>>>>>>>>>> Hmmm...you are right. I wonder if we need something like DMA_SG cap....
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Unfortunately, DMA_SG means something else. Maybe, we need DMA_MEMCPY_SG
>>>>>>>>>>>>>> to be similar with DMA_MEMSET_SG.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I'm ok with that if Vinod is.
>>>>>>>>>>>>
>>>>>>>>>>>> So what exactly is the ask here, are you trying to do MEMCPY or SG or MEMSET
>>>>>>>>>>>> or all :). We should have done bitfields for this though...
>>>>>>>>>>>
>>>>>>>>>>> Add DMA_MEMCPY_SG to transaction type.
>>>>>>>>>>
>>>>>>>>>> Not MEMSET right, then why not use DMA_SG, DMA_SG is supposed for
>>>>>>>>>> scatterlist to scatterlist copy which is used to check for
>>>>>>>>>> device_prep_dma_sg() calls
>>>>>>>>>>
>>>>>>>>> Right. But we are doing flat buffer to/from scatterlist, not sg to sg. So
>>>>>>>>> we need something separate than what DMA_SG is used for.
>>>>>>>>
>>>>>>>> Hmm, its SG-buffer  and its memcpy, so should we call it DMA_SG_BUFFER,
>>>>>>>> since it is not memset (or is it) I would not call it memset, or maybe we
>>>>>>>> should also change DMA_SG to DMA_SG_SG to make it terribly clear :D
>>>>>>>
>>>>>>> I can create patches for both.
>>>>>>
>>>>>> Great, anyone who disagrees or can give better names :)
>>>>>
>>>>> All my suggestions would involve a lot more work. If we had infinite
>>>>> time we'd stop with the per-operation-type entry points and make this
>>>>> look like a typical driver sub-system that takes commands like
>>>>> block-devices or usb, but perhaps that ship has sailed.
>>>>
>>>> Can you elaborate on this :)
>>>>
>>>> I have been thinking about the need to redo the API. So lets discuss :)
>>>
>>> The high level is straightforward, the devil is in the details. Define
>>> a generic dma command object, perhaps 'struct dma_io' certainly not
>>> 'struct dma_async_tx_descriptor', and have just one entry point
>>> per-driver. That 'struct dma_io' would carry a generic command number
>>> a target address and a scatterlist. The driver entry point would then
>>> convert and build the command to the hardware command format plus
>>> submission queue. The basic driving design principle is convert all
>>> the current function pointer complexity with the prep_* routines into
>>> data structure complexity in the common command format.
>>>
>>> This trades off some efficiency because now you need to write the
>>> generic command and write the descriptor, but I think if the operation
>>> is worth offloading those conversion costs must already be in the
>>> noise.
>>
>> Vinod, I think if you want to look at existing examples take a look at
>> the block layer request queue. Or even better blk-mq. I think this is
>> pretty close to what Dan is envisioning? Also, it's probably time we
>> looking into supporting hotplugging for DMA engines? Maybe this will
>> make it easier to do so. I'm willing to help and hoping that it will
>> make things easier for me for the next gen hardware.
> 
> Yes, device hotplug is a good one to add to the list. We didn't have
> 'struct percpu_ref' when dmaengine started, that would make hotplug
> support easier to handle without coarse locking.
> 

And also perhaps hw queue / channel hotplug. Future hardware may have
reconfigurable queues that are dynamic in number.
Vinod Koul Aug. 18, 2017, 5:35 a.m. UTC | #21
On Wed, Aug 16, 2017 at 10:16:31AM -0700, Dave Jiang wrote:
> 
> 
> On 08/16/2017 10:06 AM, Dan Williams wrote:
> > On Wed, Aug 16, 2017 at 9:50 AM, Vinod Koul <vinod.koul@intel.com> wrote:
> >> On Thu, Aug 03, 2017 at 09:14:13AM -0700, Dan Williams wrote:

> >>> All my suggestions would involve a lot more work. If we had infinite
> >>> time we'd stop with the per-operation-type entry points and make this
> >>> look like a typical driver sub-system that takes commands like
> >>> block-devices or usb, but perhaps that ship has sailed.
> >>
> >> Can you elaborate on this :)
> >>
> >> I have been thinking about the need to redo the API. So lets discuss :)
> > 
> > The high level is straightforward, the devil is in the details. Define
> > a generic dma command object, perhaps 'struct dma_io' certainly not
> > 'struct dma_async_tx_descriptor', and have just one entry point
> > per-driver. That 'struct dma_io' would carry a generic command number
> > a target address and a scatterlist. The driver entry point would then
> > convert and build the command to the hardware command format plus
> > submission queue. The basic driving design principle is convert all
> > the current function pointer complexity with the prep_* routines into
> > data structure complexity in the common command format.
> > 
> > This trades off some efficiency because now you need to write the
> > generic command and write the descriptor, but I think if the operation
> > is worth offloading those conversion costs must already be in the
> > noise.

Yes, the trade-off IMO seems a good one actually. In hindsight, the
proliferation of prep_* wasn't a good idea. Also, for request processing I
feel a lot more of the burden should be undertaken by the dmaengine core
rather than the drivers; we should just hand requests to the driver. The
descriptors should be managed in the core...

To ease the change, we can create the new API, stop accepting new drivers
to the old API, and start moving existing drivers off one bit at a time.
Yes, that will take a lot of time, but the conversion will hopefully be
less painful...

> Vinod, I think if you want to look at existing examples take a look at
> the block layer request queue. Or even better blk-mq. I think this is
> pretty close to what Dan is envisioning? Also, it's probably time we
> looking into supporting hotplugging for DMA engines? Maybe this will
> make it easier to do so. I'm willing to help and hoping that it will
> make things easier for me for the next gen hardware.

Okay, I will look it up. Thanks for the help, we need all the help here :)


Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 98e752f..c8f7a2f 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -32,6 +32,8 @@ 
 #include <linux/dax.h>
 #include <linux/nd.h>
 #include <linux/blk-mq.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
 #include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
@@ -41,12 +43,24 @@  enum {
 	PMEM_Q_MQ = 1,
 };
 
-static int queue_mode = PMEM_Q_BIO;
+static int queue_mode = PMEM_Q_MQ;
 module_param(queue_mode, int, 0444);
-MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ)");
+MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode (0=BIO, 1=BLK-MQ & DMA)");
+
+static int queue_depth = 128;
+module_param(queue_depth, int, 0444);
+MODULE_PARM_DESC(queue_depth, "I/O Queue Depth for multi queue mode");
+
+/* typically maps to number of DMA channels/devices per socket */
+static int q_per_node = 8;
+module_param(q_per_node, int, 0444);
+MODULE_PARM_DESC(q_per_node, "Hardware queues per node\n");
 
 struct pmem_cmd {
 	struct request *rq;
+	struct dma_chan *chan;
+	int sg_nents;
+	struct scatterlist sg[];
 };
 
 static struct device *to_dev(struct pmem_device *pmem)
@@ -277,6 +291,159 @@  static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
+static void nd_pmem_dma_callback(void *data,
+		const struct dmaengine_result *res)
+{
+	struct pmem_cmd *cmd = data;
+	struct request *req = cmd->rq;
+	struct request_queue *q = req->q;
+	struct pmem_device *pmem = q->queuedata;
+	struct nd_region *nd_region = to_region(pmem);
+	struct device *dev = to_dev(pmem);
+	int rc = 0;
+
+	if (res) {
+		enum dmaengine_tx_result dma_err = res->result;
+
+		switch (dma_err) {
+		case DMA_TRANS_READ_FAILED:
+		case DMA_TRANS_WRITE_FAILED:
+		case DMA_TRANS_ABORTED:
+			dev_dbg(dev, "bio failed\n");
+			rc = -ENXIO;
+			break;
+		case DMA_TRANS_NOERROR:
+		default:
+			break;
+		}
+	}
+
+	if (req->cmd_flags & REQ_FUA)
+		nvdimm_flush(nd_region);
+
+	blk_mq_end_request(cmd->rq, rc);
+}
+
+static int pmem_handle_cmd_dma(struct pmem_cmd *cmd, bool is_write)
+{
+	struct request *req = cmd->rq;
+	struct request_queue *q = req->q;
+	struct pmem_device *pmem = q->queuedata;
+	struct device *dev = to_dev(pmem);
+	phys_addr_t pmem_off = blk_rq_pos(req) * 512 + pmem->data_offset;
+	void *pmem_addr = pmem->virt_addr + pmem_off;
+	struct nd_region *nd_region = to_region(pmem);
+	size_t len;
+	struct dma_device *dma = cmd->chan->device;
+	struct dmaengine_unmap_data *unmap;
+	dma_cookie_t cookie;
+	struct dma_async_tx_descriptor *txd;
+	struct page *page;
+	unsigned int off;
+	int rc;
+	enum dma_data_direction dir;
+	dma_addr_t dma_addr;
+
+	if (req->cmd_flags & REQ_FLUSH)
+		nvdimm_flush(nd_region);
+
+	unmap = dmaengine_get_unmap_data(dma->dev, 2, GFP_NOWAIT);
+	if (!unmap) {
+		dev_dbg(dev, "failed to get dma unmap data\n");
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * If reading from pmem, writing to scatterlist,
+	 * and if writing to pmem, reading from scatterlist.
+	 */
+	dir = is_write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	cmd->sg_nents = blk_rq_map_sg(req->q, req, cmd->sg);
+	if (cmd->sg_nents < 1) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (cmd->sg_nents > 128) {
+		rc = -ENOMEM;
+		dev_warn(dev, "Number of sg greater than allocated\n");
+		goto err;
+	}
+
+	rc = dma_map_sg(dma->dev, cmd->sg, cmd->sg_nents, dir);
+	if (rc < 1) {
+		rc = -ENXIO;
+		goto err;
+	}
+
+	len = blk_rq_payload_bytes(req);
+	page = virt_to_page(pmem_addr);
+	off = offset_in_page(pmem_addr);
+	dir = is_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	dma_addr = dma_map_page(dma->dev, page, off, len, dir);
+	if (dma_mapping_error(dma->dev, unmap->addr[0])) {
+		dev_dbg(dma->dev, "src DMA mapping error\n");
+		rc = -ENXIO;
+		goto err_unmap_sg;
+	}
+
+	unmap->len = len;
+
+	if (is_write) {
+		unmap->addr[0] = dma_addr;
+		unmap->addr[1] = (dma_addr_t)cmd->sg;
+		unmap->to_cnt = 1;
+		unmap->from_cnt = 0;
+		dma_unmap_data_sg_from_nents(unmap, 2) = cmd->sg_nents;
+	} else {
+		unmap->addr[0] = (dma_addr_t)cmd->sg;
+		unmap->addr[1] = dma_addr;
+		unmap->from_cnt = 1;
+		unmap->to_cnt = 0;
+		dma_unmap_data_sg_to_nents(unmap, 2) = cmd->sg_nents;
+	}
+
+	txd = dma->device_prep_dma_memcpy_sg(cmd->chan,
+				cmd->sg, cmd->sg_nents, dma_addr,
+				!is_write, DMA_PREP_INTERRUPT);
+	if (!txd) {
+		dev_dbg(dma->dev, "dma prep failed\n");
+		rc = -ENXIO;
+		goto err_unmap_buffer;
+	}
+
+	txd->callback_result = nd_pmem_dma_callback;
+	txd->callback_param = cmd;
+	dma_set_unmap(txd, unmap);
+	cookie = dmaengine_submit(txd);
+	if (dma_submit_error(cookie)) {
+		dev_dbg(dma->dev, "dma submit error\n");
+		rc = -ENXIO;
+		goto err_set_unmap;
+	}
+
+	dmaengine_unmap_put(unmap);
+	dma_async_issue_pending(cmd->chan);
+
+	return 0;
+
+err_set_unmap:
+	dmaengine_unmap_put(unmap);
+err_unmap_buffer:
+	dma_unmap_page(dev, dma_addr, len, dir);
+err_unmap_sg:
+	if (dir == DMA_TO_DEVICE)
+		dir = DMA_FROM_DEVICE;
+	else
+		dir = DMA_TO_DEVICE;
+	dma_unmap_sg(dev, cmd->sg, cmd->sg_nents, dir);
+	dmaengine_unmap_put(unmap);
+err:
+	blk_mq_end_request(cmd->rq, -ENXIO);
+	return rc;
+}
+
 static int pmem_handle_cmd(struct pmem_cmd *cmd)
 {
 	struct request *req = cmd->rq;
@@ -310,12 +477,18 @@  static int pmem_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
 	struct pmem_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	struct request *req;
+	int rc;
 
-	cmd->rq = bd->rq;
-
-	blk_mq_start_request(bd->rq);
+	req = cmd->rq = bd->rq;
+	cmd->chan = dma_find_channel(DMA_MEMCPY);
+	blk_mq_start_request(req);
 
-	if (pmem_handle_cmd(cmd) < 0)
+	if (cmd->chan && bio_multiple_segments(req->bio))
+		rc = pmem_handle_cmd_dma(cmd, op_is_write(req_op(req)));
+	else
+		rc = pmem_handle_cmd(cmd);
+	if (rc < 0)
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	else
 		return BLK_MQ_RQ_QUEUE_OK;
@@ -341,6 +514,7 @@  static int pmem_attach_disk(struct device *dev,
 	struct gendisk *disk;
 	void *addr;
 	int rc;
+	struct dma_chan *chan;
 
 	/* while nsio_rw_bytes is active, parse a pfn info block if present */
 	if (is_nd_pfn(dev)) {
@@ -370,11 +544,20 @@  static int pmem_attach_disk(struct device *dev,
 	}
 
 	if (queue_mode == PMEM_Q_MQ) {
+		chan = dma_find_channel(DMA_MEMCPY);
+		if (!chan) {
+			queue_mode = PMEM_Q_BIO;
+			dev_warn(dev, "Forced back to PMEM_Q_BIO, no DMA\n");
+		}
+	}
+
+	if (queue_mode == PMEM_Q_MQ) {
 		pmem->tag_set.ops = &pmem_mq_ops;
-		pmem->tag_set.nr_hw_queues = nr_online_nodes;
-		pmem->tag_set.queue_depth = 64;
+		pmem->tag_set.nr_hw_queues = nr_online_nodes * q_per_node;
+		pmem->tag_set.queue_depth = queue_depth;
 		pmem->tag_set.numa_node = dev_to_node(dev);
-		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd);
+		pmem->tag_set.cmd_size = sizeof(struct pmem_cmd) +
+			sizeof(struct scatterlist) * 128;
 		pmem->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 		pmem->tag_set.driver_data = pmem;
 
@@ -437,7 +620,11 @@  static int pmem_attach_disk(struct device *dev,
 
 	blk_queue_write_cache(pmem->q, true, true);
 	blk_queue_physical_block_size(pmem->q, PAGE_SIZE);
-	blk_queue_max_hw_sectors(pmem->q, UINT_MAX);
+	if (queue_mode == PMEM_Q_MQ) {
+		blk_queue_max_hw_sectors(pmem->q, 0x200000);
+		blk_queue_max_segments(pmem->q, 128);
+	} else
+		blk_queue_max_hw_sectors(pmem->q, UINT_MAX);
 	blk_queue_bounce_limit(pmem->q, BLK_BOUNCE_ANY);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->q);
 	queue_flag_set_unlocked(QUEUE_FLAG_DAX, pmem->q);
@@ -576,15 +763,22 @@  static struct nd_device_driver nd_pmem_driver = {
 
 static int __init pmem_init(void)
 {
+	if (queue_mode == PMEM_Q_MQ)
+		dmaengine_get();
+
 	return nd_driver_register(&nd_pmem_driver);
 }
 module_init(pmem_init);
 
 static void pmem_exit(void)
 {
+	if (queue_mode == PMEM_Q_MQ)
+		dmaengine_put();
+
 	driver_unregister(&nd_pmem_driver.drv);
 }
 module_exit(pmem_exit);
 
+MODULE_SOFTDEP("pre: dmaengine");
 MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
 MODULE_LICENSE("GPL v2");