| Message ID | 20180228234006.21093-11-logang@deltatee.com (mailing list archive) |
|---|---|
| State | New, archived |
| Delegated to | Bjorn Helgaas |
> We create a configfs attribute in each nvme-fabrics target port to enable p2p memory use. When enabled, the port will only then use the p2p memory if a p2p memory device can be found which is behind the same switch as the RDMA port and all the block devices in use. If the user enabled it and no devices are found, then the system will silently fall back on using regular memory.
>
> If appropriate, that port will allocate memory for the RDMA buffers for queues from the p2pmem device, falling back to system memory should anything fail.

Nice.

> Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would save an extra PCI transfer as the NVME card could just take the data out of its own memory. However, at this time, cards with CMB buffers don't seem to be available.

Can you describe what would be the plan to have it when these devices do come along? I'd say that p2p_dev needs to become an nvmet_ns reference and not from nvmet_ctrl. Then, when cmb capable devices come along, the ns can prefer to use its own cmb instead of locating a p2p_dev device?

> +static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
> +		struct nvmet_ns *ns)
> +{
> +	int ret;
> +
> +	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
> +		pr_err("peer-to-peer DMA is not supported by %s\n",
> +		       ns->device_path);
> +		return -EINVAL;

I'd say just skip it instead of failing it; in theory one can connect nvme devices via p2p mem and expose other devices in the same subsystem. The message would be pr_debug to reduce chattiness.

> +	}
> +
> +	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
> +	if (ret)
> +		pr_err("failed to add peer-to-peer DMA client %s: %d\n",
> +		       ns->device_path, ret);
> +
> +	return ret;
> +}
> +
>  int nvmet_ns_enable(struct nvmet_ns *ns)
>  {
>  	struct nvmet_subsys *subsys = ns->subsys;
> @@ -299,6 +319,14 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
>  	if (ret)
>  		goto out_blkdev_put;
>
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		if (ctrl->p2p_dev) {
> +			ret = nvmet_p2pdma_add_client(ctrl, ns);
> +			if (ret)
> +				goto out_remove_clients;

Is this really a fatal failure given that we fall back to main memory? Why not continue with main memory (and warn at best)?

> +/*
> + * If allow_p2pmem is set, we will try to use P2P memory for the SGL lists for
> + * I/O commands. This requires the PCI p2p device to be compatible with the
> + * backing device for every namespace on this controller.
> + */
> +static void nvmet_setup_p2pmem(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
> +{
> +	struct nvmet_ns *ns;
> +	int ret;
> +
> +	if (!req->port->allow_p2pmem || !req->p2p_client)
> +		return;
> +
> +	mutex_lock(&ctrl->subsys->lock);
> +
> +	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, req->p2p_client);
> +	if (ret) {
> +		pr_err("failed adding peer-to-peer DMA client %s: %d\n",
> +		       dev_name(req->p2p_client), ret);
> +		goto free_devices;
> +	}
> +
> +	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
> +		ret = nvmet_p2pdma_add_client(ctrl, ns);
> +		if (ret)
> +			goto free_devices;
> +	}
> +
> +	ctrl->p2p_dev = pci_p2pmem_find(&ctrl->p2p_clients);

This is the first p2p_dev found, right? What happens if I have more than a single p2p device? In theory I'd have more p2p memory I can use. Have you considered making pci_p2pmem_find return the least used suitable device?

> +	if (!ctrl->p2p_dev) {
> +		pr_info("no supported peer-to-peer memory devices found\n");
> +		goto free_devices;
> +	}
> +	mutex_unlock(&ctrl->subsys->lock);
> +
> +	pr_info("using peer-to-peer memory on %s\n", pci_name(ctrl->p2p_dev));
> +	return;
> +
> +free_devices:
> +	pci_p2pdma_client_list_free(&ctrl->p2p_clients);
> +	mutex_unlock(&ctrl->subsys->lock);
> +}
> +
> +static void nvmet_release_p2pmem(struct nvmet_ctrl *ctrl)
> +{
> +	if (!ctrl->p2p_dev)
> +		return;
> +
> +	mutex_lock(&ctrl->subsys->lock);
> +
> +	pci_p2pdma_client_list_free(&ctrl->p2p_clients);
> +	pci_dev_put(ctrl->p2p_dev);
> +	ctrl->p2p_dev = NULL;
> +
> +	mutex_unlock(&ctrl->subsys->lock);
> +}
> +
>  u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
>  		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
>  {
> @@ -800,6 +890,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
>
>  	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
>  	INIT_LIST_HEAD(&ctrl->async_events);
> +	INIT_LIST_HEAD(&ctrl->p2p_clients);
>
>  	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
>  	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
> @@ -855,6 +946,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
>  		ctrl->kato = DIV_ROUND_UP(kato, 1000);
>  	}
>  	nvmet_start_keep_alive_timer(ctrl);
> +	nvmet_setup_p2pmem(ctrl, req);
>
>  	mutex_lock(&subsys->lock);
>  	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
> @@ -891,6 +983,7 @@ static void nvmet_ctrl_free(struct kref *ref)
>  	flush_work(&ctrl->async_event_work);
>  	cancel_work_sync(&ctrl->fatal_err_work);
>
> +	nvmet_release_p2pmem(ctrl);
>  	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
>
>  	kfree(ctrl->sqs);
> diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
> index 28bbdff4a88b..a213f8fc3bf3 100644
> --- a/drivers/nvme/target/io-cmd.c
> +++ b/drivers/nvme/target/io-cmd.c
> @@ -56,6 +56,9 @@ static void nvmet_execute_rw(struct nvmet_req *req)
>  		op = REQ_OP_READ;
>  	}
>
> +	if (is_pci_p2pdma_page(sg_page(req->sg)))
> +		op_flags |= REQ_PCI_P2PDMA;
> +
>  	sector = le64_to_cpu(req->cmd->rw.slba);
>  	sector <<= (req->ns->blksize_shift - 9);
>
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index 417f6c0331cc..85a170914588 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -64,6 +64,11 @@ static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
>  	return container_of(to_config_group(item), struct nvmet_ns, group);
>  }
>
> +static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
> +{
> +	return disk_to_dev(ns->bdev->bd_disk);
> +}
> +
>  struct nvmet_cq {
>  	u16			qid;
>  	u16			size;
> @@ -98,6 +103,7 @@ struct nvmet_port {
>  	struct list_head	referrals;
>  	void			*priv;
>  	bool			enabled;
> +	bool			allow_p2pmem;
>  };
>
>  static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
> @@ -131,6 +137,8 @@ struct nvmet_ctrl {
>  	struct work_struct	fatal_err_work;
>
>  	struct nvmet_fabrics_ops *ops;
> +	struct pci_dev		*p2p_dev;
> +	struct list_head	p2p_clients;
>
>  	char			subsysnqn[NVMF_NQN_FIELD_LEN];
>  	char			hostnqn[NVMF_NQN_FIELD_LEN];
> @@ -232,6 +240,8 @@ struct nvmet_req {
>
>  	void (*execute)(struct nvmet_req *req);
>  	struct nvmet_fabrics_ops *ops;
> +
> +	struct device		*p2p_client;
>  };
>
>  static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
> diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
> index 020354e11351..7a1f09995ed5 100644
> --- a/drivers/nvme/target/rdma.c
> +++ b/drivers/nvme/target/rdma.c
> @@ -23,6 +23,7 @@
>  #include <linux/string.h>
>  #include <linux/wait.h>
>  #include <linux/inet.h>
> +#include <linux/pci-p2pdma.h>
>  #include <asm/unaligned.h>
>
>  #include <rdma/ib_verbs.h>
> @@ -68,6 +69,7 @@ struct nvmet_rdma_rsp {
>  	u8			n_rdma;
>  	u32			flags;
>  	u32			invalidate_rkey;
> +	struct pci_dev		*p2p_dev;

Given that p2p_client is in nvmet_req, I think it makes sense that the p2p_dev itself would also live there. In theory, nothing is preventing FC from using it as well.
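To make the suggestion above concrete, here is an illustration-only sketch: if the provider lived in the namespace rather than the controller, selection could prefer the namespace's own CMB before a shared device. The p2p_dev field and this helper are hypothetical; neither exists in the posted patch.

	struct pci_dev;

	/* Hypothetical: a namespace carrying its own preferred provider. */
	struct nvmet_ns_sketch {
		struct pci_dev *p2p_dev;	/* CMB of the device backing this ns */
	};

	static inline struct pci_dev *
	nvmet_ns_pick_p2p_dev(struct nvmet_ns_sketch *ns, struct pci_dev *shared)
	{
		/* Prefer the backing device's own memory, else the port-wide one. */
		return ns->p2p_dev ? ns->p2p_dev : shared;
	}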
>> Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would save an extra PCI transfer as the NVME card could just take the data out of its own memory. However, at this time, cards with CMB buffers don't seem to be available.
>
> Can you describe what would be the plan to have it when these devices do come along? I'd say that p2p_dev needs to become an nvmet_ns reference and not from nvmet_ctrl. Then, when cmb capable devices come along, the ns can prefer to use its own cmb instead of locating a p2p_dev device?

Hi Sagi

Thanks for the review! That commit message is somewhat dated, as NVMe controllers with CMBs that support RDS and WDS are now commercially available [1]. However, we have not yet tried to do any kind of optimization in terms of determining which p2p_dev to use. Your suggestion above looks good and we can look into this kind of optimization in due course.

[1] http://www.eideticom.com/uploads/images/NoLoad_Product_Spec.pdf

>> +	ctrl->p2p_dev = pci_p2pmem_find(&ctrl->p2p_clients);
>
> This is the first p2p_dev found, right? What happens if I have more than a single p2p device? In theory I'd have more p2p memory I can use. Have you considered making pci_p2pmem_find return the least used suitable device?

Yes, pci_p2pmem_find will always return the first valid p2p_dev found. At the very least we should update this to allocate over all the valid p2p_devs. Since the load on any given p2p_dev will vary over time, I think a random allocation across the devices makes sense (at least for now).

Stephen
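A rough sketch of the random pick Stephen floats here, purely illustrative: it assumes a candidates[] array of providers that already passed the compatibility checks, which is not how the series structures its client list.

	#include <linux/pci.h>
	#include <linux/random.h>

	static struct pci_dev *p2pmem_pick_random(struct pci_dev **candidates,
						  unsigned int nr)
	{
		if (!nr)
			return NULL;

		/* prandom_u32_max(n) returns a value in [0, n); take a reference
		 * on the chosen provider, mirroring pci_p2pmem_find(). */
		return pci_dev_get(candidates[prandom_u32_max(nr)]);
	}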
On 01/03/18 04:03 AM, Sagi Grimberg wrote:
> Can you describe what would be the plan to have it when these devices do come along? I'd say that p2p_dev needs to become an nvmet_ns reference and not from nvmet_ctrl. Then, when cmb capable devices come along, the ns can prefer to use its own cmb instead of locating a p2p_dev device?

The patchset already supports CMB drives. That's essentially what patch 7 is for. We change the nvme-pci driver to use the p2pmem code to register and manage the CMB memory. After that, it is simply available to the nvmet code. We have already been using this with a couple of prototype CMB drives.

>> +static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
>> +		struct nvmet_ns *ns)
>> +{
>> +	int ret;
>> +
>> +	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
>> +		pr_err("peer-to-peer DMA is not supported by %s\n",
>> +		       ns->device_path);
>> +		return -EINVAL;
>
> I'd say just skip it instead of failing it; in theory one can connect nvme devices via p2p mem and expose other devices in the same subsystem. The message would be pr_debug to reduce chattiness.

No, it's actually a bit more complicated than you make it. There are a couple of cases:

1) The user adds a namespace but there hasn't been a connection and no p2pmem has been selected. In this case the add_client function is never called and the user can add whatever namespace they like.

2) When the first connect happens, nvmet_setup_p2pmem() will call add_client() for each namespace and rdma device. If any of them fail then it does not use a P2P device and falls back, as you'd like, to regular memory.

3) When the user adds a namespace after a port is in use and a compatible P2P device has been found. In this case, if the user tries to add a namespace that is not compatible with the P2P device in use then it fails adding the new namespace. The only alternative here is to tear everything down, ensure there are no P2P enabled buffers open and start using regular memory again... That is very difficult.

I also disagree that these messages should be pr_debug. If a user is trying to use P2P memory and they enable it but the system doesn't choose to use their memory, they must know why that is so they can make the necessary adjustments. If the system doesn't at least print a dmesg they are in the dark.

>> +	}
>> +
>> +	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
>> +	if (ret)
>> +		pr_err("failed to add peer-to-peer DMA client %s: %d\n",
>> +		       ns->device_path, ret);
>> +
>> +	return ret;
>> +}
>> +
>>  int nvmet_ns_enable(struct nvmet_ns *ns)
>>  {
>>  	struct nvmet_subsys *subsys = ns->subsys;
>> @@ -299,6 +319,14 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
>>  	if (ret)
>>  		goto out_blkdev_put;
>>
>> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>> +		if (ctrl->p2p_dev) {
>> +			ret = nvmet_p2pdma_add_client(ctrl, ns);
>> +			if (ret)
>> +				goto out_remove_clients;
>
> Is this really a fatal failure given that we fall back to main memory? Why not continue with main memory (and warn at best)?

See above. It's fatal because we are already using p2p memory and we can't easily tear that all down when a user adds a new namespace.

>> +	ctrl->p2p_dev = pci_p2pmem_find(&ctrl->p2p_clients);
>
> This is the first p2p_dev found, right? What happens if I have more than a single p2p device? In theory I'd have more p2p memory I can use. Have you considered making pci_p2pmem_find return the least used suitable device?

Yes, it currently returns the first found. I imagine a bunch of improvements could be made to it in future work. However, I'd probably start with finding the nearest p2p device and then falling back to the least used if that's necessary. At this time, though, it's not clear how complicated these systems will get and what's actually needed here.

>> @@ -68,6 +69,7 @@ struct nvmet_rdma_rsp {
>>  	u8			n_rdma;
>>  	u32			flags;
>>  	u32			invalidate_rkey;
>> +	struct pci_dev		*p2p_dev;
>
> Given that p2p_client is in nvmet_req, I think it makes sense that the p2p_dev itself would also live there. In theory, nothing is preventing FC from using it as well.

Fair point. I can look at moving it in v3.

Thanks,

Logan
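As a hypothetical sketch of the "nearest first, least used as tie-break" policy Logan describes: p2pmem_distance() and p2pmem_used() do not exist in this series; they stand in for a real topology walk and an allocator usage counter.

	#include <linux/kernel.h>
	#include <linux/pci.h>

	/* Hypothetical stand-ins: topology distance and current allocator load. */
	int p2pmem_distance(struct pci_dev *provider, struct list_head *clients);
	size_t p2pmem_used(struct pci_dev *provider);

	static struct pci_dev *p2pmem_pick(struct pci_dev **candidates,
					   unsigned int nr,
					   struct list_head *clients)
	{
		struct pci_dev *best = NULL;
		int best_dist = INT_MAX;
		size_t best_used = SIZE_MAX;
		unsigned int i;

		for (i = 0; i < nr; i++) {
			int dist = p2pmem_distance(candidates[i], clients);
			size_t used = p2pmem_used(candidates[i]);

			/* Prefer the closest provider; break ties by load. */
			if (dist < best_dist ||
			    (dist == best_dist && used < best_used)) {
				best = candidates[i];
				best_dist = dist;
				best_used = used;
			}
		}

		return best ? pci_dev_get(best) : NULL;
	}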
> On 01/03/18 04:03 AM, Sagi Grimberg wrote:
>> Can you describe what would be the plan to have it when these devices do come along? I'd say that p2p_dev needs to become an nvmet_ns reference and not from nvmet_ctrl. Then, when cmb capable devices come along, the ns can prefer to use its own cmb instead of locating a p2p_dev device?
>
> The patchset already supports CMB drives. That's essentially what patch 7 is for. We change the nvme-pci driver to use the p2pmem code to register and manage the CMB memory. After that, it is simply available to the nvmet code. We have already been using this with a couple of prototype CMB drives.

The comment was to your statement:

"Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would save an extra PCI transfer as the NVME card could just take the data out of its own memory. However, at this time, cards with CMB buffers don't seem to be available."

Maybe it's a left-over which confused me...

Anyways, my question still holds. If I rack several of these nvme drives, ideally we would use _their_ cmbs for I/O that is directed to these namespaces. This is why I was suggesting that p2p_dev should live in nvmet_ns and not in nvmet_ctrl as a single p2p_dev used by all namespaces. nvmet_ns seems like a more natural place to host p2p_dev with cmb in mind.

>>> +static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
>>> +		struct nvmet_ns *ns)
>>> +{
>>> +	int ret;
>>> +
>>> +	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
>>> +		pr_err("peer-to-peer DMA is not supported by %s\n",
>>> +		       ns->device_path);
>>> +		return -EINVAL;
>>
>> I'd say just skip it instead of failing it; in theory one can connect nvme devices via p2p mem and expose other devices in the same subsystem. The message would be pr_debug to reduce chattiness.
>
> No, it's actually a bit more complicated than you make it. There are a couple of cases:
>
> 1) The user adds a namespace but there hasn't been a connection and no p2pmem has been selected. In this case the add_client function is never called and the user can add whatever namespace they like.
>
> 2) When the first connect happens, nvmet_setup_p2pmem() will call add_client() for each namespace and rdma device. If any of them fail then it does not use a P2P device and falls back, as you'd like, to regular memory.
>
> 3) When the user adds a namespace after a port is in use and a compatible P2P device has been found. In this case, if the user tries to add a namespace that is not compatible with the P2P device in use then it fails adding the new namespace. The only alternative here is to tear everything down, ensure there are no P2P enabled buffers open and start using regular memory again... That is very difficult.

Wouldn't it all be simpler if the p2p_dev resolution would be private to the namespace?

So the constraint is that all the namespaces in a subsystem must comply with using p2p? Seems a little bit harsh if it's not absolutely needed. Would be nice to export a subsystem between two ports (on two HCAs, across NUMA nodes) where the home node (primary path) would use p2p and failover would use host memory...

Can you help me understand why this is absolutely not feasible?

> I also disagree that these messages should be pr_debug. If a user is trying to use P2P memory and they enable it but the system doesn't choose to use their memory, they must know why that is so they can make the necessary adjustments. If the system doesn't at least print a dmesg they are in the dark.

I was arguing that the user might have intentionally wanted to use p2p where possible but still expose other namespaces over host memory. For this user, the messages are spam. I guess info/warn could also suffice (assuming we allow it; if we fail it then there's no point to the debug level discussion).

>>> +	}
>>> +
>>> +	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
>>> +	if (ret)
>>> +		pr_err("failed to add peer-to-peer DMA client %s: %d\n",
>>> +		       ns->device_path, ret);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>  int nvmet_ns_enable(struct nvmet_ns *ns)
>>>  {
>>>  	struct nvmet_subsys *subsys = ns->subsys;
>>> @@ -299,6 +319,14 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
>>>  	if (ret)
>>>  		goto out_blkdev_put;
>>>
>>> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>>> +		if (ctrl->p2p_dev) {
>>> +			ret = nvmet_p2pdma_add_client(ctrl, ns);
>>> +			if (ret)
>>> +				goto out_remove_clients;
>>
>> Is this really a fatal failure given that we fall back to main memory? Why not continue with main memory (and warn at best)?
>
> See above. It's fatal because we are already using p2p memory and we can't easily tear that all down when a user adds a new namespace.

In my mind, I/O to this namespace would use host memory and be done with it. I guess I still don't understand why this is not possible.

>>> +	ctrl->p2p_dev = pci_p2pmem_find(&ctrl->p2p_clients);
>>
>> This is the first p2p_dev found, right? What happens if I have more than a single p2p device? In theory I'd have more p2p memory I can use. Have you considered making pci_p2pmem_find return the least used suitable device?
>
> Yes, it currently returns the first found. I imagine a bunch of improvements could be made to it in future work. However, I'd probably start with finding the nearest p2p device and then falling back to the least used if that's necessary. At this time, though, it's not clear how complicated these systems will get and what's actually needed here.

Fair enough. Just wondering.
On Thu, Mar 01, 2018 at 08:35:55PM +0200, Sagi Grimberg wrote:
>> On 01/03/18 04:03 AM, Sagi Grimberg wrote:
>>> Can you describe what would be the plan to have it when these devices do come along? I'd say that p2p_dev needs to become an nvmet_ns reference and not from nvmet_ctrl. Then, when cmb capable devices come along, the ns can prefer to use its own cmb instead of locating a p2p_dev device?
>>
>> The patchset already supports CMB drives. That's essentially what patch 7 is for. We change the nvme-pci driver to use the p2pmem code to register and manage the CMB memory. After that, it is simply available to the nvmet code. We have already been using this with a couple of prototype CMB drives.
>
> The comment was to your statement:
>
> "Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would save an extra PCI transfer as the NVME card could just take the data out of its own memory. However, at this time, cards with CMB buffers don't seem to be available."
>
> Maybe it's a left-over which confused me...
>
> Anyways, my question still holds. If I rack several of these nvme drives, ideally we would use _their_ cmbs for I/O that is directed to these namespaces. This is why I was suggesting that p2p_dev should live in nvmet_ns and not in nvmet_ctrl as a single p2p_dev used by all namespaces.

I agree, I don't think this series should target anything other than using p2p memory located in one of the devices expected to participate in the p2p transaction for a first pass. Locality is super important for p2p, so I don't think things should start out in a way that makes specifying the desired locality hard.

This is also why I don't entirely understand why this series has a generic allocator for p2p mem; it makes little sense to me. Why wouldn't the nvme driver just claim the entire CMB of its local device for its own use? Why involve p2p core code in this?

Jason
> I agree, I don't think this series should target anything other than using p2p memory located in one of the devices expected to participate in the p2p transaction for a first pass.

I disagree. There is definitely interest in using an NVMe CMB as a bounce buffer and in deploying systems where only some of the NVMe SSDs below a switch have a CMB but use P2P to access all of them. Also, there are some devices that only expose memory and their entire purpose is to act as a p2p device; supporting these devices would be valuable.

> locality is super important for p2p, so I don't think things should start out in a way that makes specifying the desired locality hard.

Ensuring that the EPs engaged in p2p are all directly connected to the same PCIe switch ensures locality and (for the switches we have tested) performance. I agree solving the case where the namespace and CMB are on the same PCIe EP is valuable, but I don't see it as critical to initial acceptance of the series.

Stephen
> Wouldn't it all be simpler if the p2p_dev resolution would be private to the namespace?
>
> So the constraint is that all the namespaces in a subsystem must comply with using p2p? Seems a little bit harsh if it's not absolutely needed. Would be nice to export a subsystem between two ports (on two HCAs, across NUMA nodes) where the home node (primary path) would use p2p and failover would use host memory...
>
> Can you help me understand why this is absolutely not feasible?

Yes, it would simplify things. However, as best as I can tell, when we allocate memory we don't know what namespace it will be used for. If there is a way, I could probably rework it so there's a P2P device per namespace. Can you show me how we'd know the namespace to use in nvmet_rdma_map_sgl_keyed()?

Thanks,

Logan
On 01/03/18 11:42 AM, Jason Gunthorpe wrote:
> This is also why I don't entirely understand why this series has a generic allocator for p2p mem; it makes little sense to me. Why wouldn't the nvme driver just claim the entire CMB of its local device for its own use? Why involve p2p core code in this?

We'd prefer to have a generic way to get p2pmem instead of restricting ourselves to only using CMBs. We did work in the past where the P2P memory was part of an IB adapter and not the NVMe card, so this won't work if it's an NVMe-only interface. As Stephen mentioned, we also use a couple of devices that only expose P2P memory, and this isn't related to NVMe at all. So there's value in having a generic interface and allocator to enable all devices to provide this memory.

If there were a hypothetical situation where a driver wants to use some of the memory for P2P and some of it for other purposes then they'd just divide it themselves and only pass a subset to pci_p2pdma_add_resource(). However, as per our changes to the nvme-pci code, it's really just easier to use the allocator for everything inside the driver.

Logan
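A minimal sketch of the division Logan describes, using the pci_p2pdma API proposed earlier in this series; the BAR number and split sizes are made up for illustration.

	#include <linux/pci.h>
	#include <linux/pci-p2pdma.h>

	static int example_setup_p2pmem(struct pci_dev *pdev)
	{
		size_t bar_len = pci_resource_len(pdev, 4);
		size_t p2p_len = bar_len / 2;	/* arbitrary: publish half */
		int rc;

		/* Hand only the second half of BAR 4 to the p2pdma allocator;
		 * the driver keeps the first half for its own purposes. */
		rc = pci_p2pdma_add_resource(pdev, 4, p2p_len, bar_len - p2p_len);
		if (rc)
			return rc;

		/* Let other subsystems discover it via pci_p2pmem_find(). */
		pci_p2pmem_publish(pdev, true);
		return 0;
	}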
On Thu, Mar 01, 2018 at 12:27:03PM -0700, Logan Gunthorpe wrote:
> On 01/03/18 11:42 AM, Jason Gunthorpe wrote:
>> This is also why I don't entirely understand why this series has a generic allocator for p2p mem; it makes little sense to me. Why wouldn't the nvme driver just claim the entire CMB of its local device for its own use? Why involve p2p core code in this?
>
> We'd prefer to have a generic way to get p2pmem instead of restricting ourselves to only using CMBs. We did work in the past where the P2P memory was part of an IB adapter and not the NVMe card, so this won't work if it's an NVMe-only interface.

It just seems like it is making it too complicated.

In 99% of cases locality is important and you don't want to accidentally use a buffer in a 3rd device that has no relation to the two doing the transfer just because it happened to be returned by the generic allocator.

I can appreciate you might have some special use case for that, but it absolutely should require special configuration and not just magically happen.

You bring up IB adapter memory - if we put that into the allocator then what is to stop the NVMe driver just using it instead of the CMB buffer? That would be totally wrong in almost all cases.

Seems like a very subtle and hard to debug performance trap to leave for the users, and pretty much the only reason to use P2P is performance... So why have such a dangerous interface?

Jason
On 01/03/18 03:45 PM, Jason Gunthorpe wrote:
> I can appreciate you might have some special use case for that, but it absolutely should require special configuration and not just magically happen.

Well, if a driver doesn't want someone doing p2p transfers with its memory, it shouldn't publish it to be used for exactly that purpose.

> You bring up IB adapter memory - if we put that into the allocator then what is to stop the NVMe driver just using it instead of the CMB buffer? That would be totally wrong in almost all cases.

If you mean for SQEs in the NVMe driver, look at the code: it specifically allocates from its own device. If you mean the nvmet-rdma code, then it's using that memory exactly as it was meant to. Again, if the IB driver doesn't want someone to use that memory for P2P transfers, it shouldn't publish it as such.

> Seems like a very subtle and hard to debug performance trap to leave for the users, and pretty much the only reason to use P2P is performance... So why have such a dangerous interface?

It's not at all dangerous: the code specifically only uses P2P memory that's local enough, and the majority of the code is there to make sure it will all work in all cases. Honestly, though, I'd love to go back to the case where the user selects which p2pmem device to use, but that was very unpopular last year. It would simplify a bunch of things, though.

Also, no, the reason to use P2P is not performance. Only if you have very specific hardware can you get a performance bump, and it isn't all that significant. The reason to use P2P is so you can design performant systems with small CPUs, less or slower DRAM, low lane counts to the CPU, etc.

Logan
>> We'd prefer to have a generic way to get p2pmem instead of restricting ourselves to only using CMBs. We did work in the past where the P2P memory was part of an IB adapter and not the NVMe card, so this won't work if it's an NVMe-only interface.
>
> It just seems like it is making it too complicated.

I disagree. Having a common allocator (instead of some separate allocator per driver) makes things simpler.

> Seems like a very subtle and hard to debug performance trap to leave for the users, and pretty much the only reason to use P2P is performance... So why have such a dangerous interface?

P2P is about offloading the memory and PCI subsystem of the host CPU, and this is achieved no matter which p2p_dev is used.

Stephen
On Thu, Mar 01, 2018 at 11:00:51PM +0000, Stephen Bates wrote:
>> Seems like a very subtle and hard to debug performance trap to leave for the users, and pretty much the only reason to use P2P is performance... So why have such a dangerous interface?
>
> P2P is about offloading the memory and PCI subsystem of the host CPU, and this is achieved no matter which p2p_dev is used.

No, locality matters. If you have a bunch of NICs and a bunch of drives and the allocator chooses to put all P2P memory on a single drive, your performance will suck horribly even if all the traffic is offloaded.

Performance will suck if you have speed differences between the PCI-E devices. E.g., a bunch of Gen 3 x8 NVMe cards paired with a Gen 4 x16 NIC will not reach full performance unless the p2p buffers are properly balanced between all cards.

This is why I think putting things in one big pool and pretending like any P2P mem is interchangeable with any other makes zero sense to me; it is not how the system actually works. Proper locality matters a lot.

Jason
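For context on the mismatch Jason describes (these are standard PCIe rates, not figures from the thread): a Gen 3 x8 link carries 8 GT/s x 8 lanes with 128b/130b encoding, roughly 63 Gb/s or 7.9 GB/s per direction, while a Gen 4 x16 link carries 16 GT/s x 16 lanes, roughly 252 Gb/s or 31.5 GB/s. Staging all buffers on a single such drive would therefore cap the NIC at about a quarter of its line rate.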
On 01/03/18 04:20 PM, Jason Gunthorpe wrote:
> No, locality matters. If you have a bunch of NICs and a bunch of drives and the allocator chooses to put all P2P memory on a single drive, your performance will suck horribly even if all the traffic is offloaded.
>
> Performance will suck if you have speed differences between the PCI-E devices. E.g., a bunch of Gen 3 x8 NVMe cards paired with a Gen 4 x16 NIC will not reach full performance unless the p2p buffers are properly balanced between all cards.

This would be solved better by choosing the closest devices in the hierarchy in the p2pmem_find function (i.e. preferring devices involved in the transaction). Also, based on the current code, it's a bit of a moot point seeing it requires all devices to be on the same switch. So unless you have a giant switch hidden somewhere, you're not going to get too many NIC/NVMe pairs.

In any case, we are looking at changing both those things so that distance in the topology is preferred and the ability to span multiple switches is allowed. We're also discussing changing it to "user picks", which simplifies all of this.

Logan
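A rough sketch of the "same switch" test the current code relies on: two endpoints below a PCIe switch share the switch's upstream port, reached by walking two bridges up. The real check lives inside pci-p2pdma; this stand-alone version is illustrative only.

	#include <linux/pci.h>

	static bool example_same_switch(struct pci_dev *a, struct pci_dev *b)
	{
		struct pci_dev *up_a = pci_upstream_bridge(a);
		struct pci_dev *up_b = pci_upstream_bridge(b);

		/* Step past each endpoint's downstream port to the switch's
		 * upstream port; equal upstream ports mean the same switch. */
		if (up_a)
			up_a = pci_upstream_bridge(up_a);
		if (up_b)
			up_b = pci_upstream_bridge(up_b);

		return up_a && up_a == up_b;
	}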
> No, locality matters. If you have a bunch of NICs and a bunch of drives and the allocator chooses to put all P2P memory on a single drive, your performance will suck horribly even if all the traffic is offloaded.

Sagi brought this up earlier in his comments about the _find_ function. We are planning to do something about this in the next version. This might be a randomization or a "user-pick", and include a rule around using the p2p_dev on the EP if that EP is part of the transaction.

Stephen
On Thu, Mar 01, 2018 at 11:00:51PM +0000, Stephen Bates wrote:
> P2P is about offloading the memory and PCI subsystem of the host CPU, and this is achieved no matter which p2p_dev is used.

Even within a device, memory attributes for its various regions may not be the same. There's a meaningful difference between writing to an NVMe CMB vs a PMR, and a single device could have both. We don't want to lump these all together without knowing which region you're allocating from, right?
On 01/03/18 04:49 PM, Keith Busch wrote:
> On Thu, Mar 01, 2018 at 11:00:51PM +0000, Stephen Bates wrote:
>> P2P is about offloading the memory and PCI subsystem of the host CPU, and this is achieved no matter which p2p_dev is used.
>
> Even within a device, memory attributes for its various regions may not be the same. There's a meaningful difference between writing to an NVMe CMB vs a PMR, and a single device could have both. We don't want to lump these all together without knowing which region you're allocating from, right?

Yes, PMRs are a whole other beast. I have no idea what anyone is going to actually do with those yet. But no one is proposing to publish them as p2pmem with this code.

Logan
> There's a meaningful difference between writing to an NVMe CMB vs PMR
When the PMR spec becomes public we can discuss how best to integrate it into the P2P framework (if at all) ;-).
Stephen
> We don't want to lump these all together without knowing which region you're allocating from, right?
In all seriousness, I do agree with you on these, Keith, in the long term. We would consider adding property flags for the memory as it is added to the p2p core, and then the allocator could evolve to intelligently dish it out. Attributes like endurance, latency and special write commit requirements could all become attributes in time. Perhaps this is one more reason for a central entity for p2p memory allocation, so this code does not end up having to go into many different drivers?
Stephen
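A hypothetical illustration of the "property flags" idea: tag each p2pmem region with attributes at registration time so the allocator could later match requests against them. None of these flags exist in the series.

	/* Hypothetical region attributes; not part of the posted patchset. */
	enum p2pmem_attrs {
		P2PMEM_ATTR_HIGH_ENDURANCE	= 1 << 0, /* safe for frequent rewrites */
		P2PMEM_ATTR_LOW_LATENCY		= 1 << 1, /* e.g. SRAM-backed BAR */
		P2PMEM_ATTR_PERSISTENT		= 1 << 2, /* PMR-like, survives reset */
		P2PMEM_ATTR_WRITE_COMMIT	= 1 << 3, /* needs explicit commit step */
	};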
On 01/03/18 04:57 PM, Stephen Bates wrote:
>> We don't want to lump these all together without knowing which region you're allocating from, right?
>
> In all seriousness, I do agree with you on these, Keith, in the long term. We would consider adding property flags for the memory as it is added to the p2p core, and then the allocator could evolve to intelligently dish it out. Attributes like endurance, latency and special write commit requirements could all become attributes in time. Perhaps this is one more reason for a central entity for p2p memory allocation, so this code does not end up having to go into many different drivers?

Ugh, I don't know... An allocator for PMRs is going to be quite a different animal. It would probably need to look more like a filesystem, given you'd need to access the same regions across reboots. I don't think it fits the p2pmem paradigm at all. I think it's something entirely new altogether, and who knows exactly what it will look like.

Logan
On Thu, Mar 01, 2018 at 11:53:16PM +0000, Stephen Bates wrote:
>> There's a meaningful difference between writing to an NVMe CMB vs a PMR
>
> When the PMR spec becomes public we can discuss how best to integrate it into the P2P framework (if at all) ;-).

http://nvmexpress.org/wp-content/uploads/NVM-Express-1.3-Ratified-TPs.zip
On Thu, Mar 01, 2018 at 11:57:43PM +0000, Stephen Bates wrote:
>> We don't want to lump these all together without knowing which region you're allocating from, right?
>
> In all seriousness, I do agree with you on these, Keith, in the long term. We would consider adding property flags for the memory as it is added to the p2p core, and then the allocator could evolve to intelligently dish it out. Attributes like endurance, latency and special write commit requirements could all become attributes in time. Perhaps this is one more reason for a central entity for p2p memory allocation, so this code does not end up having to go into many different drivers?

I fear we will find that every kind of P2P memory has special allocator requirements, and dumping it all into one giant pool is not going to be helpful.

This allocator already seems not useful for the P2P target memory on a Mellanox NIC due to the way it has a special allocation flow (windowing) and special usage requirements. Nor can it be useful for the doorbell memory in the NIC. Both of these are existing use cases for P2P with out-of-tree patches.

The allocator seems to only be able to solve the CMB problem, and I think due to that it is better to put this allocator in the NVMe driver and not the core code... at least until we find a 2nd user that needs the same allocation scheme.

Jason
On 02/03/18 09:18 AM, Jason Gunthorpe wrote:
> This allocator already seems not useful for the P2P target memory on a Mellanox NIC due to the way it has a special allocation flow (windowing) and special usage requirements.
>
> Nor can it be useful for the doorbell memory in the NIC.

No one says every P2P use has to use P2P memory. Doorbells are obviously not P2P memory. But it's the p2pmem interface that's important, and the interface absolutely does not belong in the NVMe driver. Once you have the P2P memory interface, you need an allocator behind it, and the obvious place is in the P2P code to handle the common case where you're just mapping a BAR. We don't need to implement a genalloc in every driver that has P2P memory attached to it. If some hardware wants to expose memory that requires complicated allocation, it's up to them to solve that problem, but that isn't enough justification, to me, to push common code into every driver.

> Both of these are existing use cases for P2P with out-of-tree patches.

And we have out-of-tree code that uses the generic allocator part of p2pmem.

> The allocator seems to only be able to solve the CMB problem, and I think due to that it is better to put this allocator in the NVMe driver and not the core code... at least until we find a 2nd user that needs the same allocation scheme.

See the first P2PMEM RFC. We used Chelsio's NIC instead of the CMB with a very similar allocation scheme. We'd still be enabling that NIC in the same way if we hadn't run into hardware issues with it. A simple BAR with memory behind it is always going to be the most common case.

Logan
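For a sense of the boilerplate the common allocator saves, here is a minimal sketch of what each driver would otherwise duplicate for the "simple BAR" case: a genalloc pool wrapped around an ioremapped BAR. Purely illustrative; the series does this once in the p2pdma core.

	#include <linux/genalloc.h>
	#include <linux/io.h>
	#include <linux/pci.h>

	static struct gen_pool *example_bar_pool(struct pci_dev *pdev, int bar)
	{
		struct gen_pool *pool;
		void __iomem *va;
		size_t len = pci_resource_len(pdev, bar);

		va = pci_iomap(pdev, bar, len);
		if (!va)
			return NULL;

		/* Page-granular allocations from this BAR. */
		pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
		if (!pool)
			return NULL;

		/* Track the bus address alongside the kernel mapping so
		 * callers can hand peers something DMA-able. */
		if (gen_pool_add_virt(pool, (unsigned long)va,
				      pci_resource_start(pdev, bar), len, -1)) {
			gen_pool_destroy(pool);
			return NULL;
		}

		return pool;
	}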
> http://nvmexpress.org/wp-content/uploads/NVM-Express-1.3-Ratified-TPs.zip
@Keith - my apologies.
@Christoph - thanks for the link
So my understanding of when the technical content surrounding new NVMe Technical Proposals (TPs) can be discussed was wrong. I thought the TP content could only be discussed once disclosed in the public standard. I have now learnt that once the TPs are ratified they are publicly available!
However, as Logan pointed out, PMRs are not relevant to this series so let's defer discussion on how to support them to a later date!
Stephen
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index e6b2d2af81b6..f1ad9d32344d 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -867,12 +867,41 @@ static void nvmet_port_release(struct config_item *item)
 	kfree(port);
 }
 
+#ifdef CONFIG_PCI_P2PDMA
+static ssize_t nvmet_allow_p2pmem_show(struct config_item *item, char *page)
+{
+	return sprintf(page, "%d\n", to_nvmet_port(item)->allow_p2pmem);
+}
+
+static ssize_t nvmet_allow_p2pmem_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_port *port = to_nvmet_port(item);
+	bool allow;
+	int ret;
+
+	ret = strtobool(page, &allow);
+	if (ret)
+		return ret;
+
+	down_write(&nvmet_config_sem);
+	port->allow_p2pmem = allow;
+	up_write(&nvmet_config_sem);
+
+	return count;
+}
+CONFIGFS_ATTR(nvmet_, allow_p2pmem);
+#endif /* CONFIG_PCI_P2PDMA */
+
 static struct configfs_attribute *nvmet_port_attrs[] = {
 	&nvmet_attr_addr_adrfam,
 	&nvmet_attr_addr_treq,
 	&nvmet_attr_addr_traddr,
 	&nvmet_attr_addr_trsvcid,
 	&nvmet_attr_addr_trtype,
+#ifdef CONFIG_PCI_P2PDMA
+	&nvmet_attr_allow_p2pmem,
+#endif
 	NULL,
 };
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 0bd737117a80..59847aec27db 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/rculist.h>
+#include <linux/pci-p2pdma.h>
 
 #include "nvmet.h"
 
@@ -271,6 +272,25 @@ void nvmet_put_namespace(struct nvmet_ns *ns)
 	percpu_ref_put(&ns->ref);
 }
 
+static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
+		struct nvmet_ns *ns)
+{
+	int ret;
+
+	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
+		pr_err("peer-to-peer DMA is not supported by %s\n",
+		       ns->device_path);
+		return -EINVAL;
+	}
+
+	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
+	if (ret)
+		pr_err("failed to add peer-to-peer DMA client %s: %d\n",
+		       ns->device_path, ret);
+
+	return ret;
+}
+
 int nvmet_ns_enable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
@@ -299,6 +319,14 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 	if (ret)
 		goto out_blkdev_put;
 
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (ctrl->p2p_dev) {
+			ret = nvmet_p2pdma_add_client(ctrl, ns);
+			if (ret)
+				goto out_remove_clients;
+		}
+	}
+
 	if (ns->nsid > subsys->max_nsid)
 		subsys->max_nsid = ns->nsid;
 
@@ -328,6 +356,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
+out_remove_clients:
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
 out_blkdev_put:
 	blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
 	ns->bdev = NULL;
@@ -363,8 +394,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	percpu_ref_exit(&ns->ref);
 
 	mutex_lock(&subsys->lock);
-	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		pci_p2pdma_remove_client(&ctrl->p2p_clients, nvmet_ns_dev(ns));
 		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
+	}
 
 	if (ns->bdev)
 		blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
@@ -761,6 +794,63 @@ bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
 	return __nvmet_host_allowed(subsys, hostnqn);
 }
 
+/*
+ * If allow_p2pmem is set, we will try to use P2P memory for the SGL lists for
+ * I/O commands. This requires the PCI p2p device to be compatible with the
+ * backing device for every namespace on this controller.
+ */
+static void nvmet_setup_p2pmem(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
+{
+	struct nvmet_ns *ns;
+	int ret;
+
+	if (!req->port->allow_p2pmem || !req->p2p_client)
+		return;
+
+	mutex_lock(&ctrl->subsys->lock);
+
+	ret = pci_p2pdma_add_client(&ctrl->p2p_clients, req->p2p_client);
+	if (ret) {
+		pr_err("failed adding peer-to-peer DMA client %s: %d\n",
+		       dev_name(req->p2p_client), ret);
+		goto free_devices;
+	}
+
+	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+		ret = nvmet_p2pdma_add_client(ctrl, ns);
+		if (ret)
+			goto free_devices;
+	}
+
+	ctrl->p2p_dev = pci_p2pmem_find(&ctrl->p2p_clients);
+	if (!ctrl->p2p_dev) {
+		pr_info("no supported peer-to-peer memory devices found\n");
+		goto free_devices;
+	}
+	mutex_unlock(&ctrl->subsys->lock);
+
+	pr_info("using peer-to-peer memory on %s\n", pci_name(ctrl->p2p_dev));
+	return;
+
+free_devices:
+	pci_p2pdma_client_list_free(&ctrl->p2p_clients);
+	mutex_unlock(&ctrl->subsys->lock);
+}
+
+static void nvmet_release_p2pmem(struct nvmet_ctrl *ctrl)
+{
+	if (!ctrl->p2p_dev)
+		return;
+
+	mutex_lock(&ctrl->subsys->lock);
+
+	pci_p2pdma_client_list_free(&ctrl->p2p_clients);
+	pci_dev_put(ctrl->p2p_dev);
+	ctrl->p2p_dev = NULL;
+
+	mutex_unlock(&ctrl->subsys->lock);
+}
+
 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
 {
@@ -800,6 +890,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
 	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
 	INIT_LIST_HEAD(&ctrl->async_events);
+	INIT_LIST_HEAD(&ctrl->p2p_clients);
 
 	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
 	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
@@ -855,6 +946,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 		ctrl->kato = DIV_ROUND_UP(kato, 1000);
 	}
 	nvmet_start_keep_alive_timer(ctrl);
+	nvmet_setup_p2pmem(ctrl, req);
 
 	mutex_lock(&subsys->lock);
 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
@@ -891,6 +983,7 @@ static void nvmet_ctrl_free(struct kref *ref)
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
+	nvmet_release_p2pmem(ctrl);
 	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
 
 	kfree(ctrl->sqs);
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 28bbdff4a88b..a213f8fc3bf3 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -56,6 +56,9 @@ static void nvmet_execute_rw(struct nvmet_req *req)
 		op = REQ_OP_READ;
 	}
 
+	if (is_pci_p2pdma_page(sg_page(req->sg)))
+		op_flags |= REQ_PCI_P2PDMA;
+
 	sector = le64_to_cpu(req->cmd->rw.slba);
 	sector <<= (req->ns->blksize_shift - 9);
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 417f6c0331cc..85a170914588 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -64,6 +64,11 @@ static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
 	return container_of(to_config_group(item), struct nvmet_ns, group);
 }
 
+static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
+{
+	return disk_to_dev(ns->bdev->bd_disk);
+}
+
 struct nvmet_cq {
 	u16			qid;
 	u16			size;
@@ -98,6 +103,7 @@ struct nvmet_port {
 	struct list_head	referrals;
 	void			*priv;
 	bool			enabled;
+	bool			allow_p2pmem;
 };
 
 static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
@@ -131,6 +137,8 @@ struct nvmet_ctrl {
 	struct work_struct	fatal_err_work;
 
 	struct nvmet_fabrics_ops *ops;
+	struct pci_dev		*p2p_dev;
+	struct list_head	p2p_clients;
 
 	char			subsysnqn[NVMF_NQN_FIELD_LEN];
 	char			hostnqn[NVMF_NQN_FIELD_LEN];
@@ -232,6 +240,8 @@ struct nvmet_req {
 
 	void (*execute)(struct nvmet_req *req);
 	struct nvmet_fabrics_ops *ops;
+
+	struct device		*p2p_client;
 };
 
 static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 020354e11351..7a1f09995ed5 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/wait.h>
 #include <linux/inet.h>
+#include <linux/pci-p2pdma.h>
 #include <asm/unaligned.h>
 
 #include <rdma/ib_verbs.h>
@@ -68,6 +69,7 @@ struct nvmet_rdma_rsp {
 	u8			n_rdma;
 	u32			flags;
 	u32			invalidate_rkey;
+	struct pci_dev		*p2p_dev;
 
 	struct list_head	wait_list;
 	struct list_head	free_list;
@@ -426,12 +428,18 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
 
 	if (rsp->n_rdma) {
 		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
-				queue->cm_id->port_num, rsp->req.sg,
-				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req), 0);
+				queue->cm_id->port_num, rsp->req.sg,
+				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req),
+				rsp->p2p_dev ? RDMA_RW_CTX_FLAG_PCI_P2PDMA : 0);
 	}
 
-	if (rsp->req.sg != &rsp->cmd->inline_sg)
-		sgl_free(rsp->req.sg);
+	if (rsp->req.sg != &rsp->cmd->inline_sg) {
+		if (rsp->p2p_dev)
+			pci_p2pmem_free_sgl(rsp->p2p_dev, rsp->req.sg,
+					    rsp->req.sg_cnt);
+		else
+			sgl_free(rsp->req.sg);
+	}
 
 	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
 		nvmet_rdma_process_wr_wait_list(queue);
@@ -567,19 +575,34 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
 	u64 addr = le64_to_cpu(sgl->addr);
 	u32 len = get_unaligned_le24(sgl->length);
 	u32 key = get_unaligned_le32(sgl->key);
+	struct pci_dev *p2p_dev = NULL;
 	int ret;
 
 	/* no data command? */
 	if (!len)
 		return 0;
 
-	rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
-	if (!rsp->req.sg)
-		return NVME_SC_INTERNAL;
+	if (rsp->queue->nvme_sq.ctrl)
+		p2p_dev = rsp->queue->nvme_sq.ctrl->p2p_dev;
+
+	rsp->p2p_dev = NULL;
+	if (rsp->queue->nvme_sq.qid && p2p_dev) {
+		ret = pci_p2pmem_alloc_sgl(p2p_dev, &rsp->req.sg,
+					   &rsp->req.sg_cnt, len);
+		if (!ret)
+			rsp->p2p_dev = p2p_dev;
+	}
+
+	if (!rsp->p2p_dev) {
+		rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
+		if (!rsp->req.sg)
+			return NVME_SC_INTERNAL;
+	}
 
 	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
 			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
-			nvmet_data_dir(&rsp->req), 0);
+			nvmet_data_dir(&rsp->req),
+			rsp->p2p_dev ? RDMA_RW_CTX_FLAG_PCI_P2PDMA : 0);
 	if (ret < 0)
 		return NVME_SC_INTERNAL;
 	rsp->req.transfer_len += len;
@@ -658,6 +681,8 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
 			cmd->send_sge.addr, cmd->send_sge.length,
 			DMA_TO_DEVICE);
 
+	cmd->req.p2p_client = &queue->dev->device->dev;
+
 	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
 			&queue->nvme_sq, &nvmet_rdma_ops))
 		return;
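For completeness, a usage sketch for the new configfs attribute: since the store handler parses the value with strtobool(), writing "1" enables it. The port path below is illustrative; it assumes an nvmet port has already been created as "1".

	/* Userspace sketch: toggle allow_p2pmem on an existing nvmet port. */
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/kernel/config/nvmet/ports/1/allow_p2pmem",
			      O_WRONLY);
		if (fd < 0)
			return 1;

		/* strtobool() in the kernel accepts "1"/"y"/"Y" as true. */
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}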