
[11/11] nvme: add support for streams and directives

Message ID 1497633883-21230-12-git-send-email-axboe@kernel.dk (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe June 16, 2017, 5:24 p.m. UTC
This adds support for Directives in NVMe, in particular for the Streams
directive. Support for Directives is a new feature in NVMe 1.3. It
allows a user to pass in information about where to store the data,
so that the device can do so most efficiently. If an application is
managing and writing data with different lifetimes, mixing data of
different retention times onto the same locations on flash can cause
write amplification to grow. This, in turn, reduces both the performance
and the lifetime of the device.

We default to allocating 4 streams per name space, but it is
configurable with the 'streams_per_ns' module option. If a write stream
is set on a write, flag it as such before sending it to the device. The
streams are allocated lazily - if we get a write request with a lifetime
hint, we allocate streams in the background and use them once that
is done.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |   5 ++
 include/linux/nvme.h     |  48 ++++++++++++++
 3 files changed, 221 insertions(+)

Comments

Christoph Hellwig June 16, 2017, 6:09 p.m. UTC | #1
> We default to allocating 4 streams per name space, but it is
> configurable with the 'streams_per_ns' module option.

What's your strategy for multi-namespace devices?  You won't even get
your 4 streams for a handful of namespaces with devices today or in the
near future.  Should we cut down the number of streams per namespace?  Or
just not set aside streams for namespace exclusive use?  Or so far
you simply don't care about the multi-ns case?

> +static void nvme_write_hint_work(struct work_struct *work)
> +{
	struct nvme_ns *ns = container_of(work, struct nvme_ns, write_hint_work);

nasty 81 character line :)

> +	nr_streams = streams_per_ns;
> +	if (nr_streams > ns->ctrl->nssa)
> +		nr_streams = ns->ctrl->nssa;

min() ?
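
Something like this, perhaps (untested; min_t() avoids the type mismatch
between the byte module parameter and the u16 nssa):

	nr_streams = min_t(unsigned int, streams_per_ns, ns->ctrl->nssa);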

> +static enum rw_hint nvme_get_write_stream(struct nvme_ns *ns,
> +					   struct request *req)
> +{
> +	enum rw_hint streamid = req->cmd_flags & REQ_WRITE_LIFE_MASK;
> +
> +	if (req_op(req) != REQ_OP_WRITE ||
> +	    !(ns->ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
> +		return WRITE_LIFE_NONE;

How about moving this to the caller?

> +	/*
> +	 * If we support streams and this request is a write with a valid
> +	 * hint, then flag it as such. If we haven't allocated streams on
> +	 * this ns before, do so lazily.
> +	 */
> +	stream = nvme_get_write_stream(ns, req);
> +	if (stream != WRITE_LIFE_NONE) {
> +		if (ns->nr_streams) {
> +			control |= NVME_RW_DTYPE_STREAMS;
> +			dsmgmt |= (stream << 16);
> +		} else
> +			nvme_configure_streams(ns);
> +	}

.. and instead pass control and dsmgmt to nvme_get_write_stream by
reference to isolate the functionality there.  And move the
nvme_configure_streams call into it as well.
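
As a rough, untested sketch of that shape (the name is just a suggestion,
the round-robin mapping kept from the patch, and the REQ_OP_WRITE / OACS
check left to the caller as suggested above):

	static void nvme_assign_write_stream(struct nvme_ns *ns, struct request *req,
					     u16 *control, u32 *dsmgmt)
	{
		enum rw_hint streamid = req->cmd_flags & REQ_WRITE_LIFE_MASK;

		if (streamid == WRITE_LIFE_NONE)
			return;

		/* lazy setup on the first write that carries a hint */
		if (!ns->nr_streams) {
			nvme_configure_streams(ns);
			return;
		}

		if (streamid > ns->nr_streams)
			streamid = (streamid % ns->nr_streams) + 1;

		*control |= NVME_RW_DTYPE_STREAMS;
		*dsmgmt |= streamid << 16;
	}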


Last but not least this will need some tweaks in the deallocate
code due to:

"If the host issues a Dataset Management command to deallocate logical
 blocks that are associated with a stream, it should specify a starting
 LBA and length that is aligned to and in multiples of the Stream
 Granularity Size"
Jens Axboe June 16, 2017, 7:41 p.m. UTC | #2
On 06/16/2017 12:09 PM, Christoph Hellwig wrote:
>> We default to allocating 4 streams per name space, but it is
>> configurable with the 'streams_per_ns' module option.
> 
> What's your strategy for multi-namespace devices?  You won't even get
> your 4 streams for a handful of namespaces with devices today or in the
> near future.  Should we cut down the number of streams per namespace?  Or
> just not set aside streams for namespace exclusive use?  Or so far
> you simply don't care about the multi-ns case?

I'm assuming most devices will have 8 streams, or 16. So for 2 or 4
namespaces, we'd be fine.

But honestly, I simply don't care too much about it, for a few reasons:

1) I don't know of anyone that uses name spaces to split up a device,
   except for places where they want integrity on one and not the other.
   And I haven't even seen that.

2) If you do have multiple name spaces, you get some data separation
   through that. Or not, depending on the device.

In any case, we'll just have to divide up the streams. Right now we just
allocate 4 on a first-come, first-served basis. If you have more name
spaces than streams/4, then some name spaces don't get streams. We could
make this more fair and do:

streams_per_ns = ctrl->nssa / num_namespaces;

I'm not sure what the best answer is here, or if there even is one known
right answer for this, as it will depend on the use case. That's
something we can change down the line without impacting anything, so I'm
not too worried about that.

>> +static void nvme_write_hint_work(struct work_struct *work)
>> +{
> 	struct nvme_ns *ns = container_of(work, struct nvme_ns, write_hint_work);
> 
> nasty 81 character line :)

Dear lord, I'll fix that up.

>> +	nr_streams = streams_per_ns;
>> +	if (nr_streams > ns->ctrl->nssa)
>> +		nr_streams = ns->ctrl->nssa;
> 
> min() ?

Yep, let's use that.

>> +static enum rw_hint nvme_get_write_stream(struct nvme_ns *ns,
>> +					   struct request *req)
>> +{
>> +	enum rw_hint streamid = req->cmd_flags & REQ_WRITE_LIFE_MASK;
>> +
>> +	if (req_op(req) != REQ_OP_WRITE ||
>> +	    !(ns->ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
>> +		return WRITE_LIFE_NONE;
> 
> How about moving this to the caller?
> 
>> +	/*
>> +	 * If we support streams and this request is a write with a valid
>> +	 * hint, then flag it as such. If we haven't allocated streams on
>> +	 * this ns before, do so lazily.
>> +	 */
>> +	stream = nvme_get_write_stream(ns, req);
>> +	if (stream != WRITE_LIFE_NONE) {
>> +		if (ns->nr_streams) {
>> +			control |= NVME_RW_DTYPE_STREAMS;
>> +			dsmgmt |= (stream << 16);
>> +		} else
>> +			nvme_configure_streams(ns);
>> +	}
> 
> .. and instead pass control and dsmgmt to nvme_get_write_stream by
> reference to isolate the functionality there.  And move the
> nvme_configure_streams call into it as well.

OK, I can make those two changes, fine with me.

> Last but not least this will need some tweaks in the deallocate
> code due to:
> 
> "If the host issues a Dataset Management command to deallocate logical
>  blocks that are associated with a stream, it should specify a starting
>  LBA and length that is aligned to and in multiples of the Stream
>  Granularity Size"

Do we use that internally?
Christoph Hellwig June 17, 2017, 12:21 p.m. UTC | #3
On Fri, Jun 16, 2017 at 01:41:36PM -0600, Jens Axboe wrote:
> But honestly, I simply don't care too much about it, for a few reasons:
> 
> 1) I don't know of anyone that uses name spaces to split up a device,
>    except for places where they want integrity on one and not the other.
>    And I haven't even seen that.
> 
> 2) If you do have multiple name spaces, you get some data separation
>    through that. Or not, depending on the device.
> 
> In any case, we'll just have to divide up the streams. Right now we just
> allocate 4 on a first-come, first-served basis. If you have more name
> spaces than streams/4, then some name spaces don't get streams. We could
> make this more fair and do:

Or just not reserve streams for exclusive use of the namespace and
use them directly from the global pool.  Especially if you ever want
to use streams with shared disk filesystems or databases you'd have
to do that anyway.  And due to our hardcoded streams values that
will in fact work nicely.  And it would get rid of the lazy setup
code as well.

> > "If the host issues a Dataset Management command to deallocate logical
> >  blocks that are associated with a stream, it should specify a starting
> >  LBA and length that is aligned to and in multiples of the Stream
> >  Granularity Size"
> 
> Do we use that internally?

I don't understand the comment.  If we issue a deallocate (discard)
we should align it to the Stream Granularity Size as exposed by the
device, easy enough.  While we're at it we should probably also
expose the Stream Write Size as io_min and the Stream Granularity Size
as io_opt in our I/O topology information.
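
Concretely, that could be as little as the following (untested sketch,
assuming the sws and sgs values from the streams parameters get stashed
on the namespace, in logical blocks and SWS units respectively):

	blk_queue_io_min(ns->queue, ns->sws << ns->lba_shift);
	blk_queue_io_opt(ns->queue, (ns->sws * ns->sgs) << ns->lba_shift);
	ns->queue->limits.discard_granularity = (ns->sws * ns->sgs) << ns->lba_shift;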
Jens Axboe June 17, 2017, 2:20 p.m. UTC | #4
On 06/17/2017 06:21 AM, Christoph Hellwig wrote:
> On Fri, Jun 16, 2017 at 01:41:36PM -0600, Jens Axboe wrote:
>> But honestly, I simply don't care too much about it, for a few reasons:
>>
>> 1) I don't know of anyone that uses name spaces to split up a device,
>>    except for places where they want integrity on one and not the other.
>>    And I haven't even seen that.
>>
>> 2) If you do have multiple name spaces, you get some data separation
>>    through that. Or not, depending on the device.
>>
>> In any case, we'll just have to divide up the streams. Right now we just
>> allocate 4 on a first-come, first-served basis. If you have more name
>> spaces than streams/4, then some name spaces don't get streams. We could
>> make this more fair and do:
> 
> Or just not reserve streams for exclusive use of the namespace and
> use them directly from the global pool.  Especially if you ever want
> to use streams with shared disk filesystems or databases you'd have
> to do that anyway.  And due to our hardcoded streams values that
> will in fact work nicely.  And it would get rid of the lazy setup
> code as well.

We can certainly go that route. So you'd be fine with allocating 4
streams controller wide by default, and dump the lazy alloc? We can make
this depend on the streams module parameter, so people could turn it
off, if needed.
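
That could be as simple as a boolean knob, e.g. (sketch only, exact name
up for grabs):

	static bool streams = true;
	module_param(streams, bool, 0644);
	MODULE_PARM_DESC(streams, "if available, enable the streams write directive");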

>>> "If the host issues a Dataset Management command to deallocate logical
>>>  blocks that are associated with a stream, it should specify a starting
>>>  LBA and length that is aligned to and in multiples of the Stream
>>>  Granularity Size"
>>
>> Do we use that internally?
> 
> I don't understand the comment.  If we issue a deallocate (discard)
> we should align it to the Stream Granularity Size as exposed by the
> device, easy enough.  While we're at it we should probably also
> expose the Stream Write Size as io_min and the Stream Granularity Size
> as io_opt in our I/O topology information.

OK, that's easy enough to do, if we enable streams on a controller basis
at probe time.
Christoph Hellwig June 17, 2017, 3:03 p.m. UTC | #5
On Sat, Jun 17, 2017 at 08:20:01AM -0600, Jens Axboe wrote:
> We can certainly go that route. So you'd be fine with allocating 4
> streams controller wide by default, and dump the lazy alloc? We can make
> this depend on the streams module parameter, so people could turn it
> off, if needed.

We don't even need to allocate the streams - streams are implicitly
allocated on first use:

"Stream resources that are not allocated for the exclusive use of any
 namespace are available NVM subsystem stream resources as reported in
 NVM Subsystem Streams Available (NSSA) and may be used by any namespace
 that has the Streams Directive enabled and has not been allocated
 exclusive stream resources in response to an Allocate Resources
 operation."

so the only thing you need to do is to enable streams on the namespace,
or just for the whole controller using nsid=0xffffffff
Jens Axboe June 17, 2017, 3:11 p.m. UTC | #6
On 06/17/2017 09:03 AM, Christoph Hellwig wrote:
> On Sat, Jun 17, 2017 at 08:20:01AM -0600, Jens Axboe wrote:
>> We can certainly go that route. So you'd be fine with allocating 4
>> streams controller wide by default, and dump the lazy alloc? We can make
>> this depend on the streams module parameter, so people could turn it
>> off, if needed.
> 
> We don't even need to allocate the streams - streams are implicitly
> allocated on first use:
> 
> "Stream resources that are not allocated for the exclusive use of any
>  namespace are available NVM subsystem stream resources as reported in
>  NVM Subsystem Streams Available (NSSA) and may be used by any namespace
>  that has the Streams Directive enabled and has not been allocated
>  exclusive stream resources in response to an Allocate Resources
>  operation."
> 
> so the only thing you need to do is to enable streams on the namespace,
> or just for the whole controller using nsid=0xffffffff

I have two samples here, and I just tested, and both of them want it
assigned with nsid=0xffffffff or they will fail the writes... So I'd say
we're better off ensuring we do allocate them globally.

For the IO limits, are you sure you want those exposed? SWS is a write
setting, exposing it as io_min may not make sense. For discard,
SWS * SGS * block_size ends up being huge, but I guess at least that
setting does make sense.
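
(For illustration, with made-up but plausible numbers: SWS = 32 blocks,
SGS = 1024 SWS units and 512-byte blocks gives 32 * 1024 * 512 bytes,
i.e. a 16 MB discard granularity.)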
Jens Axboe June 17, 2017, 3:43 p.m. UTC | #7
On 06/17/2017 09:11 AM, Jens Axboe wrote:
> On 06/17/2017 09:03 AM, Christoph Hellwig wrote:
>> On Sat, Jun 17, 2017 at 08:20:01AM -0600, Jens Axboe wrote:
>>> We can certainly go that route. So you'd be fine with allocating 4
>>> streams controller wide by default, and dump the lazy alloc? We can make
>>> this depend on the streams module parameter, so people could turn it
>>> off, if needed.
>>
>> We don't even need to allocate the streams - streams are implicitly
>> allocated on first use:
>>
>> "Stream resources that are not allocated for the exclusive use of any
>>  namespace are available NVM subsystem stream resources as reported in
>>  NVM Subsystem Streams Available (NSSA) and may be used by any namespace
>>  that has the Streams Directive enabled and has not been allocated
>>  exclusive stream resources in response to an Allocate Resources
>>  operation."
>>
>> so the only thing you need to do is to enable streams on the namespace,
>> or just for the whole controller using nsid=0xffffffff
> 
> I have two samples here, and I just tested, and both of them want it
> assigned with nsid=0xffffffff or they will fail the writes... So I'd say
> we're better off ensuring we do allocate them globally.
> 
> For the IO limits, are you sure you want those exposed? SWS is a write
> setting, exposing it as io_min may not make sense. For discard,
> SWS * SGS * block_size ends up being huge, but I guess at least that
> setting does make sense.

http://git.kernel.dk/cgit/linux-block/commit/?h=write-stream&id=7c36ee9b7da9fdd26a9663870f49a30ca34d36af

I think that should cover all the comments.
Christoph Hellwig June 19, 2017, 6:25 a.m. UTC | #8
On Sat, Jun 17, 2017 at 09:11:30AM -0600, Jens Axboe wrote:
> I have two samples here, and I just tested, and both of them want it
> assigned with nsid=0xffffffff or they will fail the writes... So I'd say
> we're better off ensuring we do allocate them globally.

That's clearly against the spec.  I'd say go to your vendor and get
a refund, as we Linux folks (Martin and I) fought for this so that
we would not have to do the explicit allocations.

Another quote from the spec:

"Streams are opened by the controller when the host issues a write
 command that specifies a stream identifier that is not currently open.
 While a stream is open the controller maintains context for that stream
 (e.g., buffers for associated data). The host may determine the streams 
 that are open using the Get Status operation."

And I think this is very important - otherwise you need to either
allocate the streams beforehand as in your earlier patches (and we take
away the resources from the 99% of the users not using write life
hints), or we need the lazy allocation scheme.  And for that to be
efficient it probably needs to be lazy per-stream allocation.  That's
why we got the implicit open in after all.
Jens Axboe June 19, 2017, 2:31 p.m. UTC | #9
On 06/19/2017 12:25 AM, Christoph Hellwig wrote:
> On Sat, Jun 17, 2017 at 09:11:30AM -0600, Jens Axboe wrote:
>> I have two samples here, and I just tested, and both of them want it
>> assigned with nsid=0xffffffff or they will fail the writes... So I'd say
>> we're better off ensuring we do allocate them globally.
> 
> That's clearly against the spec.  I'd say go to your vendor and get
> a refund, as we Linux folks (Martin and I) fought for this so that
> we would not have to do the explicit allocations.
> 
> Another quote from the spec:
> 
> "Streams are opened by the controller when the host issues a write
>  command that specifies a stream identifier that is not currently open.
>  While a stream is open the controller maintains context for that stream
>  (e.g., buffers for associated data). The host may determine the streams 
>  that are open using the Get Status operation."
> 
> And I think this is very important - otherwise you need to either
> allocate the streams beforehand as in your earlier patches (and we take
> away the resources from the 99% of the users not using write life
> hints), or we need the lazy allocation scheme.  And for that to be
> efficient it probably needs to be lazy per-stream allocation.  That's
> why we got the implicit open in after all.

These are just samples, so no refund possible! As you might remember,
I was pretty adamant on not wanting explicit open/close as well, back
in those days. I'll check with the vendor and see what's going on.
Jens Axboe June 19, 2017, 2:53 p.m. UTC | #10
On 06/19/2017 08:31 AM, Jens Axboe wrote:
> On 06/19/2017 12:25 AM, Christoph Hellwig wrote:
>> On Sat, Jun 17, 2017 at 09:11:30AM -0600, Jens Axboe wrote:
>>> I have two samples here, and I just tested, and both of them want it
>>> assigned with nsid=0xffffffff or they will fail the writes... So I'd say
>>> we're better off ensuring we do allocate them globally.
>>
>> That's clearly against the spec.  I'd say go to your vendor and get
>> a refund, as we Linux folks (Martin and I) fought for this so that
>> we would not have to do the explicit allocations.
>>
>> Another quote from the spec:
>>
>> "Streams are opened by the controller when the host issues a write
>>  command that specifies a stream identifier that is not currently open.
>>  While a stream is open the controller maintains context for that stream
>>  (e.g., buffers for associated data). The host may determine the streams 
>>  that are open using the Get Status operation."
>>
>> And I think this is very important - otherwise you need to either
>> allocate the streams beforehand as in your earlier patches (and we take
>> away the resources from the 99% of the users not using write life
>> hints), or we need the lazy allocation scheme.  And for that to be
>> efficient it probably needs to be lazy per-stream allocation.  That's
>> why we got the implicit open in after all.
> 
> These are just samples, so no refund possible! As you might remember,
> I was pretty adamant on not wanting explicit open/close as well, back
> in those days. I'll check with the vendor and see what's going on.

Looking at it a bit more closely - there's a difference between
assigning X number of streams (allocating) for use by the subsystem or
per-ns, and having to manually open them. So I don't necessarily think
there's a problem here, neither for us or on the device.

If I load the driver as it currently stands, and get streams params
after load:

	Number of Open Streams in NVM subsystem  (NOSNS): 0
	Allocated Stream Resources                 (ASR): 4
	Number of Open Streams in Namespace       (NOSN): 0

4 streams allocated, 0 currently open. Then I generate a single write to
some stream ID:

	Number of Open Streams in NVM subsystem  (NOSNS): 1
	Allocated Stream Resources                 (ASR): 4
	Number of Open Streams in Namespace       (NOSN): 1

and we now see a single stream actually open, implicitly, by the
device. Fire off another single stream write, this time to another
stream, and we get:

	Number of Open Streams in NVM subsystem  (NOSNS): 2
	Allocated Stream Resources                 (ASR): 4
	Number of Open Streams in Namespace       (NOSN): 2

and stream status correctly reflects that I did a WRITE_HINT_SHORT and
WRITE_HINT_LONG write:

	Open Stream Count  : 2
	Stream Identifier 000001 : 1
	Stream Identifier 000002 : 3

Awaiting clarification from the vendor on what their position/view of this
is. Reading the spec, I do agree that it leans towards only needing
allocation for a specific name space, but it doesn't explicitly say that
you can use any of the available streams, without allocation, if they
haven't been assigned to a specific name space. I would interpret it
that way, though.
Christoph Hellwig June 19, 2017, 6:53 p.m. UTC | #11
On Mon, Jun 19, 2017 at 08:53:08AM -0600, Jens Axboe wrote:
> Looking at it a bit more closely - there's a difference between
> assigning X number of streams (allocating) for use by the subsystem or
> per-ns, and having to manually open them. So I don't necessarily think
> there's a problem here, neither for us or on the device.

As far as I can tell the allocate resource is only supposed to be
for an individual namespace:

Section 9.3:

"Stream resources may be allocated for the exclusive use of a specified
 namespace associated with a particular Host Identifier using the Allocate
 Resources operation. Stream resources that are not allocated for the
 exclusive use of any namespace are available NVM subsystem stream
 resources as reported in NVM Subsystem Streams Available (NSSA) and may
 be used by any namespace that has the Streams Directive enabled and has
 not been allocated exclusive stream resources in response to an
 Allocate Resources operation"

Section 9.3.1.3:

"The Allocate Resources operation indicates the number of streams that the host requests for the exclusive use of the specified namespace."

I think this is pretty clear, but if not it would be good if you could
bring it into the working group so that we can clarify it further.

> Awaiting clarification from the vendor on what their position/view of this
> is. Reading the spec, I do agree that it leans towards only needing
> allocation for a specific name space, but it doesn't explicitly say that
> you can use any of the available streams, without allocation, if they
> haven't been assigned to a specific name space. I would interpret it
> that way, though.

Let's get some text into an ECN either way to clarify it..
Jens Axboe June 19, 2017, 7:03 p.m. UTC | #12
On 06/19/2017 12:53 PM, Christoph Hellwig wrote:
> On Mon, Jun 19, 2017 at 08:53:08AM -0600, Jens Axboe wrote:
>> Looking at it a bit more closely - there's a difference between
>> assigning X number of streams (allocating) for use by the subsystem or
>> per-ns, and having to manually open them. So I don't necessarily think
>> there's a problem here, neither for us or on the device.
> 
> As far as I can tell the allocate resource is only supposed to be
> for an individual namespace:
> 
> Section 9.3:
> 
> "Stream resources may be allocated for the exclusive use of a specified
>  namespace associated with a particular Host Identifier using the Allocate
>  Resources operation. Stream resources that are not allocated for the
>  exclusive use of any namespace are available NVM subsystem stream
>  resources as reported in NVM Subsystem Streams Available (NSSA) and may
>  be used by any namespace that has the Streams Directive enabled and has
>  not been allocated exclusive stream resources in response to an
>  Allocate Resources operation"
> 
> Section 9.3.1.3:
> 
> "The Allocate Resources operation indicates the number of streams that the host requests for the exclusive use of the specified namespace."
> 
> I think this is pretty clear, but if not it would be good if you could
> bring it into the working group so that we can clarify it further.

I'll wait until I hear back from the manufacturer; it could just be an
oversight on their end. But yes, the language should be clarified.

>> Awaiting clarification from the vendor what their position/view of this
>> is. Reading the spec, I do agree that it leans towards only needing
>> allocation for a specific name space, but it doesn't explicitly say that
>> you can use any of the available streams, without allocation, if they
>> haven't been assigned to a specific name space. I would interpret it
>> that way, though.
> 
> Let's get some text into an ECN either way to clarify it..

Thanks

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 903d5813023a..48a5acea5105 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -65,6 +65,10 @@  static bool force_apst;
 module_param(force_apst, bool, 0644);
 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
 
+static char streams_per_ns = 4;
+module_param(streams_per_ns, byte, 0644);
+MODULE_PARM_DESC(streams_per_ns, "if available, allocate this many streams per NS");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
@@ -331,9 +335,152 @@  static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+
+	c.directive.opcode = nvme_admin_directive_send;
+	c.directive.nsid = cpu_to_le32(0xffffffff);
+	c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+	c.directive.dtype = NVME_DIR_IDENTIFY;
+	c.directive.tdtype = NVME_DIR_STREAMS;
+	c.directive.endir = NVME_DIR_ENDIR;
+
+	return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_probe_directives(struct nvme_ctrl *ctrl)
+{
+	struct streams_directive_params s;
+	struct nvme_command c;
+	int ret;
+
+	if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+		return 0;
+
+	ret = nvme_enable_streams(ctrl);
+	if (ret)
+		return ret;
+
+	memset(&c, 0, sizeof(c));
+	memset(&s, 0, sizeof(s));
+
+	c.directive.opcode = nvme_admin_directive_recv;
+	c.directive.nsid = cpu_to_le32(0xffffffff);
+	c.directive.numd = sizeof(s);
+	c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+	c.directive.dtype = NVME_DIR_STREAMS;
+
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, &s, sizeof(s));
+	if (ret)
+		return ret;
+
+	ctrl->nssa = le16_to_cpu(s.nssa);
+	return 0;
+}
+
+/*
+ * Returns number of streams allocated for use by this ns, or -1 on error.
+ */
+static int nvme_streams_allocate(struct nvme_ns *ns, unsigned int streams)
+{
+	struct nvme_command c;
+	union nvme_result res;
+	int ret;
+
+	memset(&c, 0, sizeof(c));
+
+	c.directive.opcode = nvme_admin_directive_recv;
+	c.directive.nsid = cpu_to_le32(ns->ns_id);
+	c.directive.doper = NVME_DIR_RCV_ST_OP_RESOURCE;
+	c.directive.dtype = NVME_DIR_STREAMS;
+	c.directive.endir = streams;
+
+	ret = __nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, &res, NULL, 0, 0,
+					NVME_QID_ANY, 0, 0);
+	if (ret)
+		return -1;
+
+	return le32_to_cpu(res.u32) & 0xffff;
+}
+
+static int nvme_streams_deallocate(struct nvme_ns *ns)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+
+	c.directive.opcode = nvme_admin_directive_send;
+	c.directive.nsid = cpu_to_le32(ns->ns_id);
+	c.directive.doper = NVME_DIR_SND_ST_OP_REL_RSC;
+	c.directive.dtype = NVME_DIR_STREAMS;
+
+	return nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0);
+}
+
+static void nvme_write_hint_work(struct work_struct *work)
+{
+	struct nvme_ns *ns = container_of(work, struct nvme_ns, write_hint_work);
+	int ret, nr_streams;
+
+	if (ns->nr_streams)
+		return;
+
+	nr_streams = streams_per_ns;
+	if (nr_streams > ns->ctrl->nssa)
+		nr_streams = ns->ctrl->nssa;
+
+	ret = nvme_streams_allocate(ns, nr_streams);
+	if (ret <= 0)
+		goto err;
+
+	ns->nr_streams = ret;
+	dev_info(ns->ctrl->device, "successfully enabled %d streams\n", ret);
+	return;
+err:
+	dev_info(ns->ctrl->device, "failed enabling streams\n");
+	ns->ctrl->failed_streams = true;
+}
+
+static void nvme_configure_streams(struct nvme_ns *ns)
+{
+	/*
+	 * If we already called this function, we've either marked it
+	 * as a failure or set the number of streams.
+	 */
+	if (ns->ctrl->failed_streams)
+		return;
+	if (ns->nr_streams)
+		return;
+	schedule_work(&ns->write_hint_work);
+}
+
+static enum rw_hint nvme_get_write_stream(struct nvme_ns *ns,
+					   struct request *req)
+{
+	enum rw_hint streamid = req->cmd_flags & REQ_WRITE_LIFE_MASK;
+
+	if (req_op(req) != REQ_OP_WRITE ||
+	    !(ns->ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+		return WRITE_LIFE_NONE;
+
+	streamid = req->cmd_flags & REQ_WRITE_LIFE_MASK;
+	if (streamid < ARRAY_SIZE(req->q->write_hints))
+		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
+
+	if (streamid <= ns->nr_streams)
+		return streamid;
+
+	/* for now just round-robin, do something more clever later */
+	return (streamid % ns->nr_streams) + 1;
+}
+
 static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmnd)
 {
+	enum rw_hint stream;
 	u16 control = 0;
 	u32 dsmgmt = 0;
 
@@ -351,6 +498,20 @@  static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
 
+	/*
+	 * If we support streams and this request is a write with a valid
+	 * hint, then flag it as such. If we haven't allocated streams on
+	 * this ns before, do so lazily.
+	 */
+	stream = nvme_get_write_stream(ns, req);
+	if (stream != WRITE_LIFE_NONE) {
+		if (ns->nr_streams) {
+			control |= NVME_RW_DTYPE_STREAMS;
+			dsmgmt |= (stream << 16);
+		} else
+			nvme_configure_streams(ns);
+	}
+
 	if (ns->ms) {
 		switch (ns->pi_type) {
 		case NVME_NS_DPS_PI_TYPE3:
@@ -1650,6 +1811,7 @@  int nvme_init_identify(struct nvme_ctrl *ctrl)
 		dev_pm_qos_hide_latency_tolerance(ctrl->device);
 
 	nvme_configure_apst(ctrl);
+	nvme_probe_directives(ctrl);
 
 	ctrl->identified = true;
 
@@ -2049,6 +2211,8 @@  static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	nvme_set_queue_limits(ctrl, ns->queue);
 
+	INIT_WORK(&ns->write_hint_work, nvme_write_hint_work);
+
 	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
 
 	if (nvme_revalidate_ns(ns, &id))
@@ -2105,6 +2269,8 @@  static void nvme_ns_remove(struct nvme_ns *ns)
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
+	flush_work(&ns->write_hint_work);
+
 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
 		if (blk_get_integrity(ns->disk))
 			blk_integrity_unregister(ns->disk);
@@ -2112,6 +2278,8 @@  static void nvme_ns_remove(struct nvme_ns *ns)
 					&nvme_ns_attr_group);
 		if (ns->ndev)
 			nvme_nvm_unregister_sysfs(ns);
+		if (ns->nr_streams)
+			nvme_streams_deallocate(ns);
 		del_gendisk(ns->disk);
 		blk_cleanup_queue(ns->queue);
 	}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9d6a070d4391..918b6126d38b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -118,6 +118,7 @@  enum nvme_ctrl_state {
 struct nvme_ctrl {
 	enum nvme_ctrl_state state;
 	bool identified;
+	bool failed_streams;
 	spinlock_t lock;
 	const struct nvme_ctrl_ops *ops;
 	struct request_queue *admin_q;
@@ -147,6 +148,7 @@  struct nvme_ctrl {
 	u16 oncs;
 	u16 vid;
 	u16 oacs;
+	u16 nssa;
 	atomic_t abort_limit;
 	u8 event_limit;
 	u8 vwc;
@@ -192,6 +194,7 @@  struct nvme_ns {
 	u8 uuid[16];
 
 	unsigned ns_id;
+	unsigned nr_streams;
 	int lba_shift;
 	u16 ms;
 	bool ext;
@@ -203,6 +206,8 @@  struct nvme_ns {
 
 	u64 mode_select_num_blocks;
 	u32 mode_select_block_len;
+
+	struct work_struct write_hint_work;
 };
 
 struct nvme_ctrl_ops {
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b625bacf37ef..8b2f5b140134 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -245,6 +245,7 @@  enum {
 	NVME_CTRL_ONCS_WRITE_ZEROES		= 1 << 3,
 	NVME_CTRL_VWC_PRESENT			= 1 << 0,
 	NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
+	NVME_CTRL_OACS_DIRECTIVES		= 1 << 5,
 	NVME_CTRL_OACS_DBBUF_SUPP		= 1 << 7,
 };
 
@@ -295,6 +296,19 @@  enum {
 };
 
 enum {
+	NVME_DIR_IDENTIFY		= 0x00,
+	NVME_DIR_STREAMS		= 0x01,
+	NVME_DIR_SND_ID_OP_ENABLE	= 0x01,
+	NVME_DIR_SND_ST_OP_REL_ID	= 0x01,
+	NVME_DIR_SND_ST_OP_REL_RSC	= 0x02,
+	NVME_DIR_RCV_ID_OP_PARAM	= 0x01,
+	NVME_DIR_RCV_ST_OP_PARAM	= 0x01,
+	NVME_DIR_RCV_ST_OP_STATUS	= 0x02,
+	NVME_DIR_RCV_ST_OP_RESOURCE	= 0x03,
+	NVME_DIR_ENDIR			= 0x01,
+};
+
+enum {
 	NVME_NS_FEAT_THIN	= 1 << 0,
 	NVME_NS_FLBAS_LBA_MASK	= 0xf,
 	NVME_NS_FLBAS_META_EXT	= 0x10,
@@ -535,6 +549,7 @@  enum {
 	NVME_RW_PRINFO_PRCHK_APP	= 1 << 11,
 	NVME_RW_PRINFO_PRCHK_GUARD	= 1 << 12,
 	NVME_RW_PRINFO_PRACT		= 1 << 13,
+	NVME_RW_DTYPE_STREAMS		= 1 << 4,
 };
 
 struct nvme_dsm_cmd {
@@ -604,6 +619,8 @@  enum nvme_admin_opcode {
 	nvme_admin_download_fw		= 0x11,
 	nvme_admin_ns_attach		= 0x15,
 	nvme_admin_keep_alive		= 0x18,
+	nvme_admin_directive_send	= 0x19,
+	nvme_admin_directive_recv	= 0x1a,
 	nvme_admin_dbbuf		= 0x7C,
 	nvme_admin_format_nvm		= 0x80,
 	nvme_admin_security_send	= 0x81,
@@ -756,6 +773,24 @@  struct nvme_get_log_page_command {
 	__u32			rsvd14[2];
 };
 
+struct nvme_directive_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__u64			rsvd2[2];
+	union nvme_data_ptr	dptr;
+	__le32			numd;
+	__u8			doper;
+	__u8			dtype;
+	__le16			dspec;
+	__u8			endir;
+	__u8			tdtype;
+	__u16			rsvd15;
+
+	__u32			rsvd16[3];
+};
+
 /*
  * Fabrics subcommands.
  */
@@ -886,6 +921,18 @@  struct nvme_dbbuf {
 	__u32			rsvd12[6];
 };
 
+struct streams_directive_params {
+	__u16	msl;
+	__u16	nssa;
+	__u16	nsso;
+	__u8	rsvd[10];
+	__u32	sws;
+	__u16	sgs;
+	__u16	nsa;
+	__u16	nso;
+	__u8	rsvd2[6];
+};
+
 struct nvme_command {
 	union {
 		struct nvme_common_command common;
@@ -906,6 +953,7 @@  struct nvme_command {
 		struct nvmf_property_set_command prop_set;
 		struct nvmf_property_get_command prop_get;
 		struct nvme_dbbuf dbbuf;
+		struct nvme_directive_cmd directive;
 	};
 };