Message ID | 20190426131852.30142-3-bmt@zurich.ibm.com (mailing list archive)
---|---
State | Changes Requested
Delegated to: | Jason Gunthorpe
Series | SIW: Request for Comments
On Fri, Apr 26, 2019 at 03:18:42PM +0200, Bernard Metzler wrote:
> From: Bernard Metzler <bmt@rims.zurich.ibm.com>
>
> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> ---
>  drivers/infiniband/sw/siw/siw.h | 733 ++++++++++++++++++++++++++++++++
>  1 file changed, 733 insertions(+)
>  create mode 100644 drivers/infiniband/sw/siw/siw.h
>
> diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
> new file mode 100644
> index 000000000000..9a3c2abbd858
> --- /dev/null
> +++ b/drivers/infiniband/sw/siw/siw.h
> @@ -0,0 +1,733 @@
> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
> +
> +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
> +/* Copyright (c) 2008-2019, IBM Corporation */
> +
> +#ifndef _SIW_H
> +#define _SIW_H
> +
> +#include <linux/idr.h>
> +#include <rdma/ib_verbs.h>
> +#include <linux/socket.h>
> +#include <linux/skbuff.h>
> +#include <linux/in.h>
> +#include <linux/fs.h>
> +#include <linux/netdevice.h>
> +#include <crypto/hash.h>
> +#include <linux/resource.h>	/* MLOCK_LIMIT */
> +#include <linux/module.h>
> +#include <linux/version.h>
> +#include <linux/llist.h>
> +#include <linux/mm.h>
> +#include <linux/sched/signal.h>
> +
> +#include <rdma/siw_user.h>
> +#include "iwarp.h"
> +
> +/* driver debugging enabled */
> +#define DEBUG

I clearly remember that we asked to remove this.

> +	spinlock_t lock;
> +
> +	/* object management */
> +	struct idr qp_idr;
> +	struct idr mem_idr;

Why IDR and not XArray?

> +	/* active objects statistics */

refcount_t please

> +	atomic_t num_qp;
> +	atomic_t num_cq;
> +	atomic_t num_pd;
> +	atomic_t num_mr;
> +	atomic_t num_srq;
> +	atomic_t num_cep;
> +	atomic_t num_ctx;
> +

<...>

> +/*
> + * Generic memory representation for registered siw memory.
> + * Memory lookup always via higher 24 bit of STag (STag index).
> + * Object relates to memory window if embedded mr pointer is valid
> + */
> +struct siw_mem {
> +	struct siw_device *sdev;
> +	struct kref ref;

<...>

> +struct siw_qp {
> +	struct ib_qp base_qp;
> +	struct siw_device *sdev;
> +	struct kref ref;

I wonder if kref is needed in driver code.

<...>

> +/* Varia */

????

> +extern void siw_cq_flush(struct siw_cq *cq);
> +extern void siw_sq_flush(struct siw_qp *qp);
> +extern void siw_rq_flush(struct siw_qp *qp);
> +extern int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc);
> +extern void siw_print_hdr(union iwarp_hdr *hdr, int qp_id, char *string);
> +#endif
> --
> 2.17.2
>
-----"Leon Romanovsky" <leon@kernel.org> wrote: ----- >To: "Bernard Metzler" <bmt@zurich.ibm.com> >From: "Leon Romanovsky" <leon@kernel.org> >Date: 04/28/2019 01:07PM >Cc: linux-rdma@vger.kernel.org, "Bernard Metzler" ><bmt@rims.zurich.ibm.com> >Subject: Re: [PATCH v8 02/12] SIW main include file > >On Fri, Apr 26, 2019 at 03:18:42PM +0200, Bernard Metzler wrote: >> From: Bernard Metzler <bmt@rims.zurich.ibm.com> >> >> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> >> --- >> drivers/infiniband/sw/siw/siw.h | 733 >++++++++++++++++++++++++++++++++ >> 1 file changed, 733 insertions(+) >> create mode 100644 drivers/infiniband/sw/siw/siw.h >> >> diff --git a/drivers/infiniband/sw/siw/siw.h >b/drivers/infiniband/sw/siw/siw.h >> new file mode 100644 >> index 000000000000..9a3c2abbd858 >> --- /dev/null >> +++ b/drivers/infiniband/sw/siw/siw.h >> @@ -0,0 +1,733 @@ >> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ >> + >> +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ >> +/* Copyright (c) 2008-2019, IBM Corporation */ >> + >> +#ifndef _SIW_H >> +#define _SIW_H >> + >> +#include <linux/idr.h> >> +#include <rdma/ib_verbs.h> >> +#include <linux/socket.h> >> +#include <linux/skbuff.h> >> +#include <linux/in.h> >> +#include <linux/fs.h> >> +#include <linux/netdevice.h> >> +#include <crypto/hash.h> >> +#include <linux/resource.h> /* MLOCK_LIMIT */ >> +#include <linux/module.h> >> +#include <linux/version.h> >> +#include <linux/llist.h> >> +#include <linux/mm.h> >> +#include <linux/sched/signal.h> >> + >> +#include <rdma/siw_user.h> >> +#include "iwarp.h" >> + >> +/* driver debugging enabled */ >> +#define DEBUG > >I clearly remember that we asked to remove this. Absolutely. Sorry, it sneaked in again since I did some debugging. Will remove... > >> + spinlock_t lock; >> + >> + /* object management */ >> + struct idr qp_idr; >> + struct idr mem_idr; > >Why IDR and not XArray? Memory access keys and QP IDs are generated as random numbers, since both are exposed to the application. Since XArray is not designed for sparsely distributed id ranges, I am still in favor of IDR for these two resources. > >> + /* active objects statistics */ > >refcount_t please > >> + atomic_t num_qp; >> + atomic_t num_cq; >> + atomic_t num_pd; >> + atomic_t num_mr; >> + atomic_t num_srq; >> + atomic_t num_cep; >> + atomic_t num_ctx; >> + > These counters are only used to limit the amount of resources allocated to their max values per device. Since there is no equivalent for atomic_inc_return() for refcounters I'd suggest to stay with atomic_t: refcount_inc(&sdev->num_mr); if (refcount_read(&sdev->num_mr) > SIW_MAX_MR) { siw_dbg_pd(pd, "too many mr's\n"); rv = -ENOMEM; goto err_out; } vs. if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { siw_dbg_pd(pd, "too many mr's\n"); rv = -ENOMEM; goto err_out; } ><...> > >> +/* >> + * Generic memory representation for registered siw memory. >> + * Memory lookup always via higher 24 bit of STag (STag index). >> + * Object relates to memory window if embedded mr pointer is valid >> + */ >> +struct siw_mem { >> + struct siw_device *sdev; >> + struct kref ref; > ><...> > >> +struct siw_qp { >> + struct ib_qp base_qp; >> + struct siw_device *sdev; >> + struct kref ref; > >I wonder if kref is needed in driver code. > ><...> Memory and QP objects are, while generally maintained by the RDMA midlayer, also guarded against deallocation while still in use by the driver. The code tries to avoid taking resource locks for operations like in flight RDMA reads on a memory object. 
So it makes use of the release function in kref_put(). > >> +/* Varia */ > >???? > right, will remove useless comment. >> +extern void siw_cq_flush(struct siw_cq *cq); >> +extern void siw_sq_flush(struct siw_qp *qp); >> +extern void siw_rq_flush(struct siw_qp *qp); >> +extern int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc); >> +extern void siw_print_hdr(union iwarp_hdr *hdr, int qp_id, char >*string); >> +#endif >> -- >> 2.17.2 >> > >
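[Editor's note: the atomic_t variant compared above, completed into a
self-contained helper — a minimal sketch only; siw_check_mr_limit() is a
hypothetical wrapper name, and it also shows the decrement needed on the
overflow path, which the quoted snippet leaves to its err_out label:]

	#include <linux/atomic.h>
	#include <linux/errno.h>

	/* Sketch of the atomic_inc_return() limit pattern discussed above.
	 * The decrement keeps the counter accurate when the per-device
	 * limit is exceeded.
	 */
	static int siw_check_mr_limit(struct siw_device *sdev)
	{
		if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
			atomic_dec(&sdev->num_mr);
			return -ENOMEM;
		}
		return 0;
	}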
On Sun, May 05, 2019 at 04:54:50PM +0000, Bernard Metzler wrote:
<...>
> >Why IDR and not XArray?
>
> Memory access keys and QP IDs are generated as random
> numbers, since both are exposed to the application.
> Since XArray is not designed for sparsely distributed
> id ranges, I am still in favor of IDR for these two
> resources.

Why do those numbers need to be "sparsely distributed"?
Isn't this an ID to query in some internal object database?

At least the QP number can surely be sequential; there is no extra
value in making it random.

> These counters are only used to limit the amount of
> resources allocated to their max values per device.
>
> Since there is no equivalent for atomic_inc_return()
> for refcounters, I'd suggest staying with atomic_t:
<...>

No problem, anyway we are planning to generalize it and remove it
from all drivers.

> Memory and QP objects are, while generally maintained
> by the RDMA midlayer, also guarded against deallocation
> while still in use by the driver. The code tries to avoid
> taking resource locks for operations like in-flight RDMA
> reads on a memory object. So it makes use of the release
> function in kref_put().

krefs are not replacing the locks, but protect from release
during the operation on that object. I don't understand
the connection that you draw here between the RDMA midlayer
and in-flight operations.
-----"Leon Romanovsky" <leon@kernel.org> wrote: ----- >To: "Bernard Metzler" <BMT@zurich.ibm.com> >From: "Leon Romanovsky" <leon@kernel.org> >Date: 05/05/2019 07:10PM >Cc: linux-rdma@vger.kernel.org, "Bernard Metzler" ><bmt@rims.zurich.ibm.com> >Subject: Re: [PATCH v8 02/12] SIW main include file > >On Sun, May 05, 2019 at 04:54:50PM +0000, Bernard Metzler wrote: >> -----"Leon Romanovsky" <leon@kernel.org> wrote: ----- >> >> >To: "Bernard Metzler" <bmt@zurich.ibm.com> >> >From: "Leon Romanovsky" <leon@kernel.org> >> >Date: 04/28/2019 01:07PM >> >Cc: linux-rdma@vger.kernel.org, "Bernard Metzler" >> ><bmt@rims.zurich.ibm.com> >> >Subject: Re: [PATCH v8 02/12] SIW main include file >> > >> >On Fri, Apr 26, 2019 at 03:18:42PM +0200, Bernard Metzler wrote: >> >> From: Bernard Metzler <bmt@rims.zurich.ibm.com> >> >> >> >> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> >> >> --- >> >> drivers/infiniband/sw/siw/siw.h | 733 >> >++++++++++++++++++++++++++++++++ >> >> 1 file changed, 733 insertions(+) >> >> create mode 100644 drivers/infiniband/sw/siw/siw.h >> >> >> >> diff --git a/drivers/infiniband/sw/siw/siw.h >> >b/drivers/infiniband/sw/siw/siw.h >> >> new file mode 100644 >> >> index 000000000000..9a3c2abbd858 >> >> --- /dev/null >> >> +++ b/drivers/infiniband/sw/siw/siw.h >> >> @@ -0,0 +1,733 @@ >> >> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ >> >> + >> >> +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ >> >> +/* Copyright (c) 2008-2019, IBM Corporation */ >> >> + >> >> +#ifndef _SIW_H >> >> +#define _SIW_H >> >> + >> >> +#include <linux/idr.h> >> >> +#include <rdma/ib_verbs.h> >> >> +#include <linux/socket.h> >> >> +#include <linux/skbuff.h> >> >> +#include <linux/in.h> >> >> +#include <linux/fs.h> >> >> +#include <linux/netdevice.h> >> >> +#include <crypto/hash.h> >> >> +#include <linux/resource.h> /* MLOCK_LIMIT */ >> >> +#include <linux/module.h> >> >> +#include <linux/version.h> >> >> +#include <linux/llist.h> >> >> +#include <linux/mm.h> >> >> +#include <linux/sched/signal.h> >> >> + >> >> +#include <rdma/siw_user.h> >> >> +#include "iwarp.h" >> >> + >> >> +/* driver debugging enabled */ >> >> +#define DEBUG >> > >> >I clearly remember that we asked to remove this. >> >> Absolutely. Sorry, it sneaked in again since I did some >> debugging. Will remove... >> > >> >> + spinlock_t lock; >> >> + >> >> + /* object management */ >> >> + struct idr qp_idr; >> >> + struct idr mem_idr; >> > >> >Why IDR and not XArray? >> >> Memory access keys and QP IDs are generated as random >> numbers, since both are exposed to the application. >> Since XArray is not designed for sparsely distributed >> id ranges, I am still in favor of IDR for these two >> resources. > >Why do those number need to be "sparsely distributed"? >Isn't this ID to query in some internal object database? > >At lease QP number is for sure can be sequential, there is no extra >value of creating it to be random. > Yes, I can drop that for QP's. At the other hand, those QP numbers are used by the application to reference the right QP during siw_accept and siw_connect. So for the application it becomes easy to guess amd hijack a valid QP number, if it is not random. I am not sure if the RDMA midlayer takes care if the application uses the right QP number during accept/connect. I can add checks into siw (e.g. right PD or some such). 
>> >> > >> >> + /* active objects statistics */ >> > >> >refcount_t please >> > >> >> + atomic_t num_qp; >> >> + atomic_t num_cq; >> >> + atomic_t num_pd; >> >> + atomic_t num_mr; >> >> + atomic_t num_srq; >> >> + atomic_t num_cep; >> >> + atomic_t num_ctx; >> >> + >> > >> >> These counters are only used to limit the amount of >> resources allocated to their max values per device. >> >> Since there is no equivalent for atomic_inc_return() >> for refcounters I'd suggest to stay with atomic_t: >> >> refcount_inc(&sdev->num_mr); >> if (refcount_read(&sdev->num_mr) > SIW_MAX_MR) { >> siw_dbg_pd(pd, "too many mr's\n"); >> rv = -ENOMEM; >> goto err_out; >> } >> vs. >> if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { >> siw_dbg_pd(pd, "too many mr's\n"); >> rv = -ENOMEM; >> goto err_out; >> } >> > >No problem, anyway we are planning to generalize it and remove from >all drivers. > Okay >> >> >> ><...> >> > >> >> +/* >> >> + * Generic memory representation for registered siw memory. >> >> + * Memory lookup always via higher 24 bit of STag (STag index). >> >> + * Object relates to memory window if embedded mr pointer is >valid >> >> + */ >> >> +struct siw_mem { >> >> + struct siw_device *sdev; >> >> + struct kref ref; >> > >> ><...> >> > >> >> +struct siw_qp { >> >> + struct ib_qp base_qp; >> >> + struct siw_device *sdev; >> >> + struct kref ref; >> > >> >I wonder if kref is needed in driver code. >> > >> ><...> >> >> Memory and QP objects are, while generally maintained >> by the RDMA midlayer, also guarded against deallocation >> while still in use by the driver. The code tries to avoid >> taking resource locks for operations like in flight RDMA reads >> on a memory object. So it makes use of the release function >> in kref_put(). > >krefs are not replacing the locks, but protect from release >during the operation on that object. I don't understand >the connection that your draw here between RDMA midlayer >and in-flight operations. An in flight RDMA Read/Write to/from a memory must finish before the memory gets invalidated. I avoid spinlocking + irq disabling each time memory gets accessed. > >> > >> >> +/* Varia */ >> > >> >???? >> > >> right, will remove useless comment. >> >> >> +extern void siw_cq_flush(struct siw_cq *cq); >> >> +extern void siw_sq_flush(struct siw_qp *qp); >> >> +extern void siw_rq_flush(struct siw_qp *qp); >> >> +extern int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc); >> >> +extern void siw_print_hdr(union iwarp_hdr *hdr, int qp_id, char >> >*string); >> >> +#endif >> >> -- >> >> 2.17.2 >> >> >> > >> > >> > >
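[Editor's note: a minimal sketch of the kref release pattern being
described — siw_free_mem() and siw_mem_put() are illustrative names,
not taken from the posted patch:]

	#include <linux/kref.h>
	#include <linux/slab.h>

	/* Called by kref_put() once the last reference is gone, i.e.
	 * no in-flight RDMA Read/Write still uses this memory object.
	 */
	static void siw_free_mem(struct kref *ref)
	{
		struct siw_mem *mem = container_of(ref, struct siw_mem, ref);

		kfree(mem);
	}

	static inline void siw_mem_put(struct siw_mem *mem)
	{
		kref_put(&mem->ref, siw_free_mem);
	}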
On Tue, May 07, 2019 at 03:54:45PM +0000, Bernard Metzler wrote:
<...>
> >> Memory access keys and QP IDs are generated as random
> >> numbers, since both are exposed to the application.
> >> Since XArray is not designed for sparsely distributed
> >> id ranges, I am still in favor of IDR for these two
> >> resources.

IDR and xarray have identical underlying storage so this is nonsense

No new idr's or radix tree users will be accepted into rdma.... Use
xarray

Jason
-----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: ----- >To: "Bernard Metzler" <BMT@zurich.ibm.com> >From: "Jason Gunthorpe" <jgg@ziepe.ca> >Date: 05/07/2019 07:09PM >Cc: "Leon Romanovsky" <leon@kernel.org>, linux-rdma@vger.kernel.org >Subject: Re: [PATCH v8 02/12] SIW main include file > >On Tue, May 07, 2019 at 03:54:45PM +0000, Bernard Metzler wrote: >> >> >To: "Bernard Metzler" <BMT@zurich.ibm.com> >> >From: "Leon Romanovsky" <leon@kernel.org> >> >Date: 05/05/2019 07:10PM >> >Cc: linux-rdma@vger.kernel.org, "Bernard Metzler" >> ><bmt@rims.zurich.ibm.com> >> >Subject: Re: [PATCH v8 02/12] SIW main include file >> > >> >On Sun, May 05, 2019 at 04:54:50PM +0000, Bernard Metzler wrote: >> >> >> >> >To: "Bernard Metzler" <bmt@zurich.ibm.com> >> >> >From: "Leon Romanovsky" <leon@kernel.org> >> >> >Date: 04/28/2019 01:07PM >> >> >Cc: linux-rdma@vger.kernel.org, "Bernard Metzler" >> >> ><bmt@rims.zurich.ibm.com> >> >> >Subject: Re: [PATCH v8 02/12] SIW main include file >> >> > >> >> >On Fri, Apr 26, 2019 at 03:18:42PM +0200, Bernard Metzler >wrote: >> >> >> From: Bernard Metzler <bmt@rims.zurich.ibm.com> >> >> >> >> >> >> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> >> >> >> drivers/infiniband/sw/siw/siw.h | 733 >> >> >++++++++++++++++++++++++++++++++ >> >> >> 1 file changed, 733 insertions(+) >> >> >> create mode 100644 drivers/infiniband/sw/siw/siw.h >> >> >> >> >> >> diff --git a/drivers/infiniband/sw/siw/siw.h >> >> >b/drivers/infiniband/sw/siw/siw.h >> >> >> new file mode 100644 >> >> >> index 000000000000..9a3c2abbd858 >> >> >> +++ b/drivers/infiniband/sw/siw/siw.h >> >> >> @@ -0,0 +1,733 @@ >> >> >> +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ >> >> >> + >> >> >> +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ >> >> >> +/* Copyright (c) 2008-2019, IBM Corporation */ >> >> >> + >> >> >> +#ifndef _SIW_H >> >> >> +#define _SIW_H >> >> >> + >> >> >> +#include <linux/idr.h> >> >> >> +#include <rdma/ib_verbs.h> >> >> >> +#include <linux/socket.h> >> >> >> +#include <linux/skbuff.h> >> >> >> +#include <linux/in.h> >> >> >> +#include <linux/fs.h> >> >> >> +#include <linux/netdevice.h> >> >> >> +#include <crypto/hash.h> >> >> >> +#include <linux/resource.h> /* MLOCK_LIMIT */ >> >> >> +#include <linux/module.h> >> >> >> +#include <linux/version.h> >> >> >> +#include <linux/llist.h> >> >> >> +#include <linux/mm.h> >> >> >> +#include <linux/sched/signal.h> >> >> >> + >> >> >> +#include <rdma/siw_user.h> >> >> >> +#include "iwarp.h" >> >> >> + >> >> >> +/* driver debugging enabled */ >> >> >> +#define DEBUG >> >> > >> >> >I clearly remember that we asked to remove this. >> >> >> >> Absolutely. Sorry, it sneaked in again since I did some >> >> debugging. Will remove... >> >> > >> >> >> + spinlock_t lock; >> >> >> + >> >> >> + /* object management */ >> >> >> + struct idr qp_idr; >> >> >> + struct idr mem_idr; >> >> > >> >> >Why IDR and not XArray? >> >> >> >> Memory access keys and QP IDs are generated as random >> >> numbers, since both are exposed to the application. >> >> Since XArray is not designed for sparsely distributed >> >> id ranges, I am still in favor of IDR for these two >> >> resources. > >IDR and xarray have identical underlying storage so this is nonsense > >No new idr's or radix tree users will be accepted into rdma.... Use >xarray > Sounds good to me! I just came across that introductory video from Matthew, where he explicitly stated that xarray will be not very efficient if the indices are not densely clustered. 
But maybe this is all far beyond the 24bits of index space a memory key is in. So let me drop that IDR thing completely, while handling randomized 24 bit memory keys. Thanks Bernard
On Wed, May 08, 2019 at 08:07:59AM +0000, Bernard Metzler wrote:
<...>
> Sounds good to me! I just came across that introductory video from
> Matthew, where he explicitly stated that xarray will not be very
> efficient if the indices are not densely clustered. But maybe this
> is all far beyond the 24 bits of index space a memory key is in.
> So let me drop that IDR thing completely, while handling randomized
> 24 bit memory keys.

xarray/idr is a poor choice to store highly unclustered random data

I'm not sure why this is a problem, shouldn't the driver be in control
of mkey assignment? Just use xa_alloc_cyclic and it will be
sufficiently clustered to be efficient.

Jason
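[Editor's note: a minimal sketch of what xa_alloc_cyclic()-based QP ID
allocation could look like — the qp_xa and qp_next fields on siw_device
are assumptions for illustration, not part of the posted patch:]

	#include <linux/xarray.h>

	/* Assumed additions to struct siw_device:
	 *	struct xarray qp_xa;	// xa_init_flags(&qp_xa, XA_FLAGS_ALLOC)
	 *	u32 qp_next;		// cyclic allocation hint
	 */
	static int siw_qp_add_xa(struct siw_device *sdev, struct siw_qp *qp)
	{
		/* IDs advance cyclically, so they stay clustered and the
		 * underlying radix tree remains shallow.
		 */
		int rv = xa_alloc_cyclic(&sdev->qp_xa, &qp->id, qp,
					 XA_LIMIT(1, SIW_MAX_QP - 1),
					 &sdev->qp_next, GFP_KERNEL);

		return rv < 0 ? rv : 0;
	}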
-----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: ----- >To: "Bernard Metzler" <BMT@zurich.ibm.com> >From: "Jason Gunthorpe" <jgg@ziepe.ca> >Date: 05/08/2019 03:08PM >Cc: "Leon Romanovsky" <leon@kernel.org>, linux-rdma@vger.kernel.org >Subject: Re: [PATCH v8 02/12] SIW main include file > >On Wed, May 08, 2019 at 08:07:59AM +0000, Bernard Metzler wrote: >> >> >> Memory access keys and QP IDs are generated as random >> >> >> numbers, since both are exposed to the application. >> >> >> Since XArray is not designed for sparsely distributed >> >> >> id ranges, I am still in favor of IDR for these two >> >> >> resources. >> > >> >IDR and xarray have identical underlying storage so this is >nonsense >> > >> >No new idr's or radix tree users will be accepted into rdma.... >Use >> >xarray >> > >> Sounds good to me! I just came across that introductory video from >Matthew, >> where he explicitly stated that xarray will be not very efficient >if the >> indices are not densely clustered. But maybe this is all far beyond >the >> 24bits of index space a memory key is in. So let me drop that IDR >thing >> completely, while handling randomized 24 bit memory keys. > >xarray/idr is a poor choice to store highly unclustered random data > >I'm not sure why this is a problem, shouldn't the driver be in >control >of mkey assignment? Just use xa_alloc_cyclic and it will be >sufficiently clustered to be efficient. > It is a recommendation to choose a hard to predict memory key (to make it hard for an attacker to guess it). From RFC 5040, sec 8.1.1: An RNIC MUST choose the value of STags in a way difficult to predict. It is RECOMMENDED to sparsely populate them over the full available range. Since I did not want to roll my own bug-prone key based lookup, I chose idr. If you tell me xarray is just as inefficient as idr for sparse index distributions, I'll take xarray. Thanks, Bernard.
On Wed, May 08, 2019 at 02:06:05PM +0000, Bernard Metzler wrote:
<...>
> It is a recommendation to choose a hard-to-predict memory
> key (to make it hard for an attacker to guess it). From
> RFC 5040, sec 8.1.1:
>
>    An RNIC MUST choose the value of STags in a way difficult to
>    predict.  It is RECOMMENDED to sparsely populate them over the
>    full available range.
>
> Since I did not want to roll my own bug-prone key-based lookup,
> I chose idr. If you tell me xarray is just as inefficient as
> idr for sparse index distributions, I'll take xarray.

Yah, this probably wants to be an RB tree or some other data
structure.. But you can leave it as xarray, it just wastes memory.

Jason
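[Editor's note: for illustration, a sketch of the RB tree alternative
Jason names — keying siw_mem objects by STag index with the kernel
rbtree API; siw_mem_insert() and the 'node' field on siw_mem are
hypothetical additions, not part of the posted patch:]

	#include <linux/rbtree.h>

	/* Assumes 'struct rb_node node;' added to struct siw_mem and an
	 * rb_root per device.
	 */
	static bool siw_mem_insert(struct rb_root *root, struct siw_mem *new)
	{
		struct rb_node **link = &root->rb_node, *parent = NULL;
		u32 key = new->stag >> 8;	/* 24-bit STag index */

		while (*link) {
			struct siw_mem *m = rb_entry(*link, struct siw_mem, node);

			parent = *link;
			if (key < (m->stag >> 8))
				link = &(*link)->rb_left;
			else if (key > (m->stag >> 8))
				link = &(*link)->rb_right;
			else
				return false;	/* index already in use */
		}
		rb_link_node(&new->node, parent, link);
		rb_insert_color(&new->node, root);
		return true;
	}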
On Wed, May 08, 2019 at 02:06:05PM +0000, Bernard Metzler wrote:
<...>
> It is a recommendation to choose a hard-to-predict memory
> key (to make it hard for an attacker to guess it). From
> RFC 5040, sec 8.1.1:
>
>    An RNIC MUST choose the value of STags in a way difficult to
>    predict.  It is RECOMMENDED to sparsely populate them over the
>    full available range.

Nice, security by obscurity. This recommendation is nonsense in real
life; protection should be done by separating PDs and not by hiding
stags.
-----"Leon Romanovsky" <leon@kernel.org> wrote: ----- >To: "Bernard Metzler" <BMT@zurich.ibm.com> >From: "Leon Romanovsky" <leon@kernel.org> >Date: 05/08/2019 04:22PM >Cc: "Jason Gunthorpe" <jgg@ziepe.ca>, linux-rdma@vger.kernel.org >Subject: Re: [PATCH v8 02/12] SIW main include file > >On Wed, May 08, 2019 at 02:06:05PM +0000, Bernard Metzler wrote: >> -----"Jason Gunthorpe" <jgg@ziepe.ca> wrote: ----- >> >> >To: "Bernard Metzler" <BMT@zurich.ibm.com> >> >From: "Jason Gunthorpe" <jgg@ziepe.ca> >> >Date: 05/08/2019 03:08PM >> >Cc: "Leon Romanovsky" <leon@kernel.org>, >linux-rdma@vger.kernel.org >> >Subject: Re: [PATCH v8 02/12] SIW main include file >> > >> >On Wed, May 08, 2019 at 08:07:59AM +0000, Bernard Metzler wrote: >> >> >> >> Memory access keys and QP IDs are generated as random >> >> >> >> numbers, since both are exposed to the application. >> >> >> >> Since XArray is not designed for sparsely distributed >> >> >> >> id ranges, I am still in favor of IDR for these two >> >> >> >> resources. >> >> > >> >> >IDR and xarray have identical underlying storage so this is >> >nonsense >> >> > >> >> >No new idr's or radix tree users will be accepted into rdma.... >> >Use >> >> >xarray >> >> > >> >> Sounds good to me! I just came across that introductory video >from >> >Matthew, >> >> where he explicitly stated that xarray will be not very >efficient >> >if the >> >> indices are not densely clustered. But maybe this is all far >beyond >> >the >> >> 24bits of index space a memory key is in. So let me drop that >IDR >> >thing >> >> completely, while handling randomized 24 bit memory keys. >> > >> >xarray/idr is a poor choice to store highly unclustered random >data >> > >> >I'm not sure why this is a problem, shouldn't the driver be in >> >control >> >of mkey assignment? Just use xa_alloc_cyclic and it will be >> >sufficiently clustered to be efficient. >> > >> >> It is a recommendation to choose a hard to predict memory >> key (to make it hard for an attacker to guess it). From >> RFC 5040, sec 8.1.1: >> >> An RNIC MUST choose the value of STags in a way difficult to >> predict. It is RECOMMENDED to sparsely populate them over the >> full available range. > >Nice, security by obscurity, this recommendation is nonsense in real >life, >protection should be done by separating PDs and not by hiding stags. > Not sure about that one. Randomizing peer visible references of local protocol state isn't something invented just here. Think of TCP's ISN randomization etc. Maybe best, in the near future, I'll do some RB tree thing for more efficiency. >> >> Since I did not want to roll my own bug-prone key based lookup, >> I chose idr. If you tell me xarray is just as inefficient as >> idr for sparse index distributions, I'll take xarray. >> >> Thanks, >> Bernard. >> > >
On Wed, 2019-05-08 at 17:22 +0300, Leon Romanovsky wrote:
> > It is a recommendation to choose a hard-to-predict memory
> > key (to make it hard for an attacker to guess it). From
> > RFC 5040, sec 8.1.1:
> >
> >    An RNIC MUST choose the value of STags in a way difficult to
> >    predict.  It is RECOMMENDED to sparsely populate them over the
> >    full available range.
>
> Nice, security by obscurity. This recommendation is nonsense in real
> life; protection should be done by separating PDs and not by hiding
> stags.

That rather misses the point. The point isn't whether your PDs are
separate, but whether a malicious third party can easily guess your
next generated ID so it can be used in an attack. This isn't security
by obscurity, it's security by non-guessability, and it's been shown
to be necessary multiple times over in network stacks.
On Wed, May 08, 2019 at 11:41:37AM -0400, Doug Ledford wrote:
> That rather misses the point. The point isn't whether your PDs are
> separate, but whether a malicious third party can easily guess your
> next generated ID so it can be used in an attack. This isn't security
> by obscurity, it's security by non-guessability, and it's been shown
> to be necessary multiple times over in network stacks.

ok

> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>     Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
new file mode 100644
index 000000000000..9a3c2abbd858
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -0,0 +1,733 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_H
+#define _SIW_H
+
+#include <linux/idr.h>
+#include <rdma/ib_verbs.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <crypto/hash.h>
+#include <linux/resource.h>	/* MLOCK_LIMIT */
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/llist.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+
+#include <rdma/siw_user.h>
+#include "iwarp.h"
+
+/* driver debugging enabled */
+#define DEBUG
+
+#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */
+#define SIW_VENDORT_PART_ID 0
+#define SIW_MAX_QP (1024 * 100)
+#define SIW_MAX_QP_WR (1024 * 32)
+#define SIW_MAX_ORD_QP 128
+#define SIW_MAX_IRD_QP 128
+#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */
+#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */
+#define SIW_MAX_CQ (1024 * 100)
+#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100)
+#define SIW_MAX_MR (SIW_MAX_QP * 10)
+#define SIW_MAX_PD SIW_MAX_QP
+#define SIW_MAX_MW 0 /* to be set if MW's are supported */
+#define SIW_MAX_FMR SIW_MAX_MR
+#define SIW_MAX_SRQ SIW_MAX_QP
+#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10)
+#define SIW_MAX_CONTEXT SIW_MAX_PD
+
+/* Min number of bytes for using zero copy transmit */
+#define SENDPAGE_THRESH PAGE_SIZE
+
+/* Maximum number of frames which can be sent in one SQ processing */
+#define SQ_USER_MAXBURST 100
+
+/* Maximum number of consecutive IRQ elements which get served
+ * if SQ has pending work. Prevents starving local SQ processing
+ * by serving peer Read Requests.
+ */
+#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4
+
+struct siw_dev_cap {
+	int max_qp;
+	int max_qp_wr;
+	int max_ord; /* max. outbound read queue depth */
+	int max_ird; /* max. inbound read queue depth */
+	int max_sge;
+	int max_sge_rd;
+	int max_cq;
+	int max_cqe;
+	int max_mr;
+	int max_pd;
+	int max_mw;
+	int max_fmr;
+	int max_srq;
+	int max_srq_wr;
+	int max_srq_sge;
+};
+
+struct siw_device {
+	struct ib_device base_dev;
+	struct net_device *netdev;
+	struct siw_dev_cap attrs;
+
+	u32 vendor_part_id;
+	int numa_node;
+
+	/* physical port state (only one port per device) */
+	enum ib_port_state state;
+
+	spinlock_t lock;
+
+	/* object management */
+	struct idr qp_idr;
+	struct idr mem_idr;
+
+	struct list_head cep_list;
+	struct list_head qp_list;
+	struct list_head mr_list;
+
+	/* active objects statistics */
+	atomic_t num_qp;
+	atomic_t num_cq;
+	atomic_t num_pd;
+	atomic_t num_mr;
+	atomic_t num_srq;
+	atomic_t num_cep;
+	atomic_t num_ctx;
+
+	struct work_struct netdev_down;
+};
+
+struct siw_uobj {
+	void *addr;
+	u32 size;
+};
+
+struct siw_ucontext {
+	struct ib_ucontext base_ucontext;
+	struct siw_device *sdev;
+
+	/* xarray of user mappable objects */
+	struct xarray xa;
+	u32 uobj_nextkey;
+};
+
+struct siw_pd {
+	struct ib_pd base_pd;
+	struct siw_device *sdev;
+};
+
+/*
+ * The RDMA core does not define LOCAL_READ access, which is always
+ * enabled implicitly.
+ */
+#define IWARP_ACCESS_MASK					\
+	(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |	\
+	 IB_ACCESS_REMOTE_READ)
+
+struct siw_mr;
+
+/*
+ * siw presentation of user memory registered as source
+ * or target of RDMA operations.
+ */
+
+struct siw_page_chunk {
+	struct page **p;
+};
+
+struct siw_umem {
+	struct siw_page_chunk *page_chunk;
+	int num_pages;
+	bool writable;
+	u64 fp_addr; /* First page base address */
+	struct mm_struct *owning_mm;
+};
+
+struct siw_pble {
+	u64 addr; /* Address of assigned user buffer */
+	u64 size; /* Size of this entry */
+	u64 pbl_off; /* Total offset from start of PBL */
+};
+
+struct siw_pbl {
+	unsigned int num_buf;
+	unsigned int max_buf;
+	struct siw_pble pbe[1];
+};
+
+/*
+ * Generic memory representation for registered siw memory.
+ * Memory lookup always via higher 24 bit of STag (STag index).
+ * Object relates to memory window if embedded mr pointer is valid
+ */
+struct siw_mem {
+	struct siw_device *sdev;
+	struct kref ref;
+	struct siw_mr *mr; /* assoc. MR if MW, NULL if MR */
+	u64 va; /* VA of memory */
+	u64 len; /* amount of memory bytes */
+	u32 stag; /* iWarp memory access steering tag */
+	u8 stag_valid; /* VALID or INVALID */
+	u8 is_pbl; /* PBL or user space mem */
+
+	enum ib_access_flags perms; /* local/remote READ & WRITE */
+};
+
+#define SIW_MEM_IS_MW(m) ((m)->mr != NULL)
+
+/*
+ * MR and MW definition.
+ * Uses RDMA base structs ib_mr/ib_mw holding:
+ * lkey, rkey, MW reference count on MR
+ */
+struct siw_mr {
+	struct ib_mr base_mr;
+	struct siw_mem mem;
+	struct list_head devq;
+	struct rcu_head rcu;
+	union {
+		struct siw_umem *umem;
+		struct siw_pbl *pbl;
+		void *mem_obj;
+	};
+	struct siw_pd *pd;
+};
+
+struct siw_mw {
+	struct ib_mw base_mw;
+	struct siw_mem mem;
+	struct rcu_head rcu;
+};
+
+/*
+ * Error codes for local or remote
+ * access to registered memory
+ */
+enum siw_access_state {
+	E_ACCESS_OK = 0,
+	E_STAG_INVALID,
+	E_BASE_BOUNDS,
+	E_ACCESS_PERM,
+	E_PD_MISMATCH
+};
+
+enum siw_wr_state {
+	SIW_WR_IDLE = 0,
+	SIW_WR_QUEUED = 1, /* processing has not started yet */
+	SIW_WR_INPROGRESS = 2 /* initiated processing of the WR */
+};
+
+/* The WQE currently being processed (RX or TX) */
+struct siw_wqe {
+	/* Copy of application's SQE or RQE */
+	union {
+		struct siw_sqe sqe;
+		struct siw_rqe rqe;
+	};
+	struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */
+	enum siw_wr_state wr_status;
+	enum siw_wc_status wc_status;
+	u32 bytes; /* total bytes to process */
+	u32 processed; /* bytes processed */
+};
+
+struct siw_cq {
+	struct ib_cq base_cq;
+	struct siw_device *sdev;
+	spinlock_t lock;
+	u64 *notify;
+	struct siw_cqe *queue;
+	u32 cq_put;
+	u32 cq_get;
+	u32 num_cqe;
+	bool kernel_verbs;
+	u32 xa_cq_index; /* mmap information for CQE array */
+	u32 id; /* For debugging only */
+};
+
+enum siw_qp_state {
+	SIW_QP_STATE_IDLE = 0,
+	SIW_QP_STATE_RTR = 1,
+	SIW_QP_STATE_RTS = 2,
+	SIW_QP_STATE_CLOSING = 3,
+	SIW_QP_STATE_TERMINATE = 4,
+	SIW_QP_STATE_ERROR = 5,
+	SIW_QP_STATE_COUNT = 6
+};
+
+enum siw_qp_flags {
+	SIW_RDMA_BIND_ENABLED = (1 << 0),
+	SIW_RDMA_WRITE_ENABLED = (1 << 1),
+	SIW_RDMA_READ_ENABLED = (1 << 2),
+	SIW_SIGNAL_ALL_WR = (1 << 3),
+	SIW_MPA_CRC = (1 << 4),
+	SIW_QP_IN_DESTROY = (1 << 5)
+};
+
+enum siw_qp_attr_mask {
+	SIW_QP_ATTR_STATE = (1 << 0),
+	SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1),
+	SIW_QP_ATTR_LLP_HANDLE = (1 << 2),
+	SIW_QP_ATTR_ORD = (1 << 3),
+	SIW_QP_ATTR_IRD = (1 << 4),
+	SIW_QP_ATTR_SQ_SIZE = (1 << 5),
+	SIW_QP_ATTR_RQ_SIZE = (1 << 6),
+	SIW_QP_ATTR_MPA = (1 << 7)
+};
+
+struct siw_sk_upcalls {
+	void (*sk_state_change)(struct sock *sk);
+	void (*sk_data_ready)(struct sock *sk, int bytes);
+	void (*sk_write_space)(struct sock *sk);
+	void (*sk_error_report)(struct sock *sk);
+};
+
+struct siw_srq {
+	struct ib_srq base_srq;
+	struct siw_pd *pd;
+	spinlock_t lock;
+	u32 max_sge;
+	u32 limit; /* low watermark for async event */
+	struct siw_rqe *recvq;
+	u32 rq_put;
+	u32 rq_get;
+	u32 num_rqe; /* max # of wqe's allowed */
+	u32 xa_srq_index; /* mmap information for SRQ array */
+	char armed; /* inform user if limit hit */
+	char kernel_verbs; /* '1' if kernel client */
+};
+
+struct siw_qp_attrs {
+	enum siw_qp_state state;
+	u32 sq_size;
+	u32 rq_size;
+	u32 orq_size;
+	u32 irq_size;
+	u32 sq_max_sges;
+	u32 rq_max_sges;
+	enum siw_qp_flags flags;
+
+	struct socket *sk;
+};
+
+enum siw_tx_ctx {
+	SIW_SEND_HDR = 0, /* start or continue sending HDR */
+	SIW_SEND_DATA = 1, /* start or continue sending DDP payload */
+	SIW_SEND_TRAILER = 2, /* start or continue sending TRAILER */
+	SIW_SEND_SHORT_FPDU = 3 /* send whole FPDU hdr|data|trailer at once */
+};
+
+enum siw_rx_state {
+	SIW_GET_HDR = 0, /* await new hdr or within hdr */
+	SIW_GET_DATA_START = 1, /* start of inbound DDP payload */
+	SIW_GET_DATA_MORE = 2, /* continuation of (misaligned) DDP payload */
+	SIW_GET_TRAILER = 3 /* await new trailer or within trailer */
+};
+
+struct siw_iwarp_rx {
+	struct sk_buff *skb;
+	union iwarp_hdr hdr;
+	struct mpa_trailer trailer;
+	/*
+	 * local destination memory of inbound iwarp operation.
+	 * valid, according to wqe->wr_status
+	 */
+	struct siw_wqe wqe_active;
+
+	struct shash_desc *mpa_crc_hd;
+	/*
+	 * Next expected DDP MSN for each QN +
+	 * expected steering tag +
+	 * expected DDP tagged offset (all HBO)
+	 */
+	u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+	u32 ddp_stag;
+	u64 ddp_to;
+
+	/*
+	 * For each FPDU, main RX loop runs through 3 stages:
+	 * Receiving protocol headers, placing DDP payload and receiving
+	 * trailer information (CRC + eventual padding).
+	 * Next two variables keep state on receive status of the
+	 * current FPDU part (hdr, data, trailer).
+	 */
+	int fpdu_part_rcvd; /* bytes in pkt part copied */
+	int fpdu_part_rem; /* bytes in pkt part not seen */
+
+	int skb_new; /* pending unread bytes in skb */
+	int skb_offset; /* offset in skb */
+	int skb_copied; /* processed bytes in skb */
+
+	int pbl_idx; /* Index into current PBL */
+
+	int sge_idx; /* current sge in rx */
+	unsigned int sge_off; /* already rcvd in curr. sge */
+
+	enum siw_rx_state state;
+
+	u32 inval_stag; /* Stag to be invalidated */
+
+	u8 first_ddp_seg : 1, /* this is first DDP seg */
+	   more_ddp_segs : 1, /* more DDP segs expected */
+	   rx_suspend : 1, /* stop rcv DDP segs. */
+	   unused : 1,
+	   prev_rdmap_opcode : 4; /* opcode of prev msg */
+	char pad; /* # of pad bytes expected */
+};
+
+#define siw_rx_data(qp, ctx)					\
+	(iwarp_pktinfo[__rdmap_get_opcode(&ctx->hdr.ctrl)].proc_data(qp, ctx))
+
+/*
+ * Shorthands for short packets w/o payload
+ * to be transmitted more efficiently.
+ */
+struct siw_send_pkt {
+	struct iwarp_send send;
+	__be32 crc;
+};
+
+struct siw_write_pkt {
+	struct iwarp_rdma_write write;
+	__be32 crc;
+};
+
+struct siw_rreq_pkt {
+	struct iwarp_rdma_rreq rreq;
+	__be32 crc;
+};
+
+struct siw_rresp_pkt {
+	struct iwarp_rdma_rresp rresp;
+	__be32 crc;
+};
+
+struct siw_iwarp_tx {
+	union {
+		union iwarp_hdr hdr;
+
+		/* Generic part of FPDU header */
+		struct iwarp_ctrl ctrl;
+		struct iwarp_ctrl_untagged c_untagged;
+		struct iwarp_ctrl_tagged c_tagged;
+
+		/* FPDU headers */
+		struct iwarp_rdma_write rwrite;
+		struct iwarp_rdma_rreq rreq;
+		struct iwarp_rdma_rresp rresp;
+		struct iwarp_terminate terminate;
+		struct iwarp_send send;
+		struct iwarp_send_inv send_inv;
+
+		/* complete short FPDUs */
+		struct siw_send_pkt send_pkt;
+		struct siw_write_pkt write_pkt;
+		struct siw_rreq_pkt rreq_pkt;
+		struct siw_rresp_pkt rresp_pkt;
+	} pkt;
+
+	struct mpa_trailer trailer;
+	/* DDP MSN for untagged messages */
+	u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+
+	enum siw_tx_ctx state;
+	u16 ctrl_len; /* ddp+rdmap hdr */
+	u16 ctrl_sent;
+	int burst;
+	int bytes_unsent; /* ddp payload bytes */
+
+	struct shash_desc *mpa_crc_hd;
+
+	u8 do_crc : 1, /* do crc for segment */
+	   use_sendpage : 1, /* send w/o copy */
+	   tx_suspend : 1, /* stop sending DDP segs. */
+	   pad : 2, /* # pad in current fpdu */
+	   orq_fence : 1, /* ORQ full or Send fenced */
+	   in_syscall : 1, /* TX out of user context */
+	   zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
+	u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+
+	u16 fpdu_len; /* len of FPDU to tx */
+	unsigned int tcp_seglen; /* remaining tcp seg space */
+
+	struct siw_wqe wqe_active;
+
+	int pbl_idx; /* Index into current PBL */
+	int sge_idx; /* current sge in tx */
+	u32 sge_off; /* already sent in curr. sge */
+};
+
+struct siw_qp {
+	struct ib_qp base_qp;
+	struct siw_device *sdev;
+	struct kref ref;
+	struct list_head devq;
+	int tx_cpu;
+	bool kernel_verbs;
+	struct siw_qp_attrs attrs;
+
+	struct siw_cep *cep;
+	struct rw_semaphore state_lock;
+
+	struct siw_pd *pd;
+	struct siw_cq *scq;
+	struct siw_cq *rcq;
+	struct siw_srq *srq;
+
+	struct siw_iwarp_tx tx_ctx; /* Transmit context */
+	spinlock_t sq_lock;
+	struct siw_sqe *sendq; /* send queue element array */
+	uint32_t sq_get; /* consumer index into sq array */
+	uint32_t sq_put; /* kernel prod. index into sq array */
+	struct llist_node tx_list;
+
+	struct siw_sqe *orq; /* outbound read queue element array */
+	spinlock_t orq_lock;
+	uint32_t orq_get; /* consumer index into orq array */
+	uint32_t orq_put; /* shared producer index for ORQ */
+
+	struct siw_iwarp_rx rx_ctx; /* Receive context */
+	spinlock_t rq_lock;
+	struct siw_rqe *recvq; /* recv queue element array */
+	uint32_t rq_get; /* consumer index into rq array */
+	uint32_t rq_put; /* kernel prod. index into rq array */
+
+	struct siw_sqe *irq; /* inbound read queue element array */
+	uint32_t irq_get; /* consumer index into irq array */
+	uint32_t irq_put; /* producer index into irq array */
+	int irq_burst;
+
+	struct { /* information to be carried in TERMINATE pkt, if valid */
+		u8 valid;
+		u8 in_tx;
+		u8 layer : 4, etype : 4;
+		u8 ecode;
+	} term_info;
+	u32 xa_sq_index; /* mmap information for SQE array */
+	u32 xa_rq_index; /* mmap information for RQE array */
+	u32 id; /* ID for debugging only */
+};
+
+/* helper macros */
+#define rx_qp(rx) container_of(rx, struct siw_qp, rx_ctx)
+#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx)
+#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active)
+#define rx_wqe(qp) (&(qp)->rx_ctx.wqe_active)
+#define rx_mem(qp) ((qp)->rx_ctx.wqe_active.mem[0])
+#define tx_type(wqe) ((wqe)->sqe.opcode)
+#define rx_type(wqe) ((wqe)->rqe.opcode)
+#define tx_flags(wqe) ((wqe)->sqe.flags)
+
+struct iwarp_msg_info {
+	int hdr_len;
+	struct iwarp_ctrl ctrl;
+	int (*proc_data)(struct siw_qp *qp, struct siw_iwarp_rx *rctx);
+};
+
+/* Global siw parameters. Currently set in siw_main.c */
+extern const bool zcopy_tx;
+extern const u8 gso_seg_limit;
+extern const bool loopback_enabled;
+extern const bool mpa_crc_required;
+extern const bool mpa_crc_strict;
+extern const bool siw_tcp_nagle;
+extern u_char mpa_version;
+extern const bool peer_to_peer;
+
+extern struct crypto_shash *siw_crypto_shash;
+extern struct task_struct *siw_tx_thread[];
+extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1];
+
+/* QP general functions */
+extern int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr,
+			 enum siw_qp_attr_mask mask);
+extern int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl);
+extern void siw_qp_llp_close(struct siw_qp *qp);
+extern void siw_qp_cm_drop(struct siw_qp *qp, int schedule);
+extern void siw_send_terminate(struct siw_qp *qp);
+
+extern struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id);
+extern void siw_qp_get_ref(struct ib_qp *qp);
+extern void siw_qp_put_ref(struct ib_qp *qp);
+extern int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp);
+extern void siw_free_qp(struct kref *ref);
+
+extern void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
+			       u8 etype, u8 ecode, int in_tx);
+extern enum ddp_ecode siw_tagged_error(enum siw_access_state state);
+extern enum rdmap_ecode siw_rdmap_error(enum siw_access_state state);
+
+extern void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe);
+extern int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+			    enum siw_wc_status status);
+extern int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+			    u32 inval_stag, enum siw_wc_status status);
+extern void siw_qp_llp_data_ready(struct sock *sk);
+extern void siw_qp_llp_write_space(struct sock *sk);
+
+/* QP TX path functions */
+extern int siw_run_sq(void *arg);
+extern int siw_qp_sq_process(struct siw_qp *qp);
+extern int siw_sq_start(struct siw_qp *qp);
+extern int siw_activate_tx(struct siw_qp *qp);
+extern void siw_stop_tx_thread(int nr_cpu);
+extern int siw_get_tx_cpu(struct siw_device *sdev);
+extern void siw_put_tx_cpu(int cpu);
+
+/* QP RX path functions */
+extern int siw_proc_send(struct siw_qp *qp, struct siw_iwarp_rx *rctx);
+extern int siw_proc_rreq(struct siw_qp *qp, struct siw_iwarp_rx *rctx);
+extern int siw_proc_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx);
+extern int siw_proc_write(struct siw_qp *qp, struct siw_iwarp_rx *rctx);
+extern int siw_proc_terminate(struct siw_qp *qp, struct siw_iwarp_rx *rctx);
+
+extern int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+			   unsigned int off, size_t len);
+
+static inline int siw_crc_array(struct shash_desc *desc, u8 *start, size_t len)
+{
+	return crypto_shash_update(desc, start, len);
+}
+
+static inline int siw_crc_page(struct shash_desc *desc, struct page *p, int off,
+			       int len)
+{
+	return crypto_shash_update(desc, page_address(p) + off, len);
+}
+
+static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx)
+{
+	return container_of(base_ctx, struct siw_ucontext, base_ucontext);
+}
+
+static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp)
+{
+	return container_of(base_qp, struct siw_qp, base_qp);
+}
+
+static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq)
+{
+	return container_of(base_cq, struct siw_cq, base_cq);
+}
+
+static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq)
+{
+	return container_of(base_srq, struct siw_srq, base_srq);
+}
+
+static inline struct siw_device *to_siw_dev(struct ib_device *base_dev)
+{
+	return container_of(base_dev, struct siw_device, base_dev);
+}
+
+static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr)
+{
+	return container_of(base_mr, struct siw_mr, base_mr);
+}
+
+static inline struct siw_pd *to_siw_pd(struct ib_pd *base_pd)
+{
+	return container_of(base_pd, struct siw_pd, base_pd);
+}
+
+static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id)
+{
+	struct siw_qp *qp = idr_find(&sdev->qp_idr, id);
+
+	if (likely(qp && kref_get_unless_zero(&qp->ref)))
+		return qp;
+
+	return NULL;
+}
+
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+	kref_get(&qp->ref);
+}
+
+static inline void siw_qp_put(struct siw_qp *qp)
+{
+	kref_put(&qp->ref, siw_free_qp);
+}
+
+static inline int siw_sq_empty(struct siw_qp *qp)
+{
+	struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+	return READ_ONCE(sqe->flags) == 0;
+}
+
+static inline struct siw_sqe *sq_get_next(struct siw_qp *qp)
+{
+	struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+	if (READ_ONCE(sqe->flags) & SIW_WQE_VALID)
+		return sqe;
+
+	return NULL;
+}
+
+static inline struct siw_sqe *orq_get_current(struct siw_qp *qp)
+{
+	return &qp->orq[qp->orq_get % qp->attrs.orq_size];
+}
+
+static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp)
+{
+	if (likely(qp->attrs.orq_size))
+		return &qp->orq[qp->orq_put % qp->attrs.orq_size];
+
+	pr_warn("siw: QP[%d]: ORQ has zero length", qp->id);
+	return NULL;
+}
+
+static inline struct siw_sqe *orq_get_free(struct siw_qp *qp)
+{
+	struct siw_sqe *orq_e = orq_get_tail(qp);
+
+	if (orq_e && READ_ONCE(orq_e->flags) == 0)
+		return orq_e;
+
+	return NULL;
+}
+
+static inline int siw_orq_empty(struct siw_qp *qp)
+{
+	return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0;
+}
+
+static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp)
+{
+	struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size];
+
+	if (READ_ONCE(irq_e->flags) == 0) {
+		qp->irq_put++;
+		return irq_e;
+	}
+	return NULL;
+}
+
+static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
+{
+	if (!SIW_MEM_IS_MW(m))
+		return container_of(m, struct siw_mr, mem);
+	return m->mr;
+}
+
+/* Varia */
+extern void siw_cq_flush(struct siw_cq *cq);
+extern void siw_sq_flush(struct siw_qp *qp);
+extern void siw_rq_flush(struct siw_qp *qp);
+extern int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc);
+extern void siw_print_hdr(union iwarp_hdr *hdr, int qp_id, char *string);
+#endif
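[Editor's note: a hedged sketch of how the reference-counted QP lookup
above is meant to be used — siw_example_qp_op() is a made-up caller,
not a function from the patch:]

	/* Hypothetical caller of siw_qp_id2obj()/siw_qp_put(): the kref
	 * taken by the lookup pins the QP for the duration of the
	 * operation; the final kref_put() invokes siw_free_qp().
	 */
	static void siw_example_qp_op(struct siw_device *sdev, int id)
	{
		struct siw_qp *qp = siw_qp_id2obj(sdev, id);

		if (!qp)
			return;	/* no such QP, or already being freed */

		/* ... operate on qp without holding sdev->lock ... */

		siw_qp_put(qp);
	}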