diff mbox series

[for-next,v2,07/18] RDMA/rxe: Make task interface pluggable

Message ID 20221021200118.2163-8-rpearsonhpe@gmail.com (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Headers show
Series Implement work queues for rdma_rxe | expand

Commit Message

Bob Pearson Oct. 21, 2022, 8:01 p.m. UTC
Make the internal interface to the task operations pluggable and
add a new 'inline' type.

Signed-off-by: Ian Ziemba <ian.ziemba@hpe.com>
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_qp.c   |   8 +-
 drivers/infiniband/sw/rxe/rxe_task.c | 160 ++++++++++++++++++++++-----
 drivers/infiniband/sw/rxe/rxe_task.h |  44 +++++---
 3 files changed, 165 insertions(+), 47 deletions(-)

Comments

Daisuke Matsuda (Fujitsu) Oct. 25, 2022, 8:02 a.m. UTC | #1
On Sat, Oct 22, 2022 5:01 AM Bob Pearson wrote:
> 
> Make the internal interface to the task operations pluggable and
> add a new 'inline' type.

I do not see why we need the new 'inline' type. It may be technically
possible to add it, but is there any situation where it is useful?
With the new mode, It will take longer for softirq (NET_RX_SOFTIRQ)
to complete its work, and that can cause a negative effect on the system
as a whole.


> 
> Signed-off-by: Ian Ziemba <ian.ziemba@hpe.com>
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_qp.c   |   8 +-
>  drivers/infiniband/sw/rxe/rxe_task.c | 160 ++++++++++++++++++++++-----
>  drivers/infiniband/sw/rxe/rxe_task.h |  44 +++++---
>  3 files changed, 165 insertions(+), 47 deletions(-)

<...>

> +
> +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *),
> +		  enum rxe_task_type type)
> +{
> +	task->arg	= arg;
> +	task->func	= func;
> +	task->destroyed	= false;
> +	task->type	= type;
> +	task->state	= TASK_STATE_START;
> +
> +	spin_lock_init(&task->lock);
> +
> +	switch (type) {
> +	case RXE_TASK_TYPE_INLINE:
> +		inline_init(task);
> +		break;
> +	case RXE_TASK_TYPE_TASKLET:
> +		tsklet_init(task);
> +		break;
> +	default:
> +		pr_debug("%s: invalid task type = %d\n", __func__, type);
> +		return -EINVAL;

If this 'default' label is executed, task->ops is left unassigned, and the system 
crashes later. When I executed test_atomic_cmp_and_swap (a testcase in rdma-core),
my system crashed with the following backtrace:
=====
[ 2276.212255] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 2276.214319] #PF: supervisor read access in kernel mode
[ 2276.215774] #PF: error_code(0x0000) - not-present page
[ 2276.217050] PGD 0 P4D 0
[ 2276.217558] Oops: 0000 [#1] PREEMPT SMP PTI
[ 2276.218352] CPU: 2 PID: 6352 Comm: python3 Kdump: loaded Not tainted 6.1.0-rc1+ #2
[ 2276.219771] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
[ 2276.221466] RIP: 0010:rxe_run_task+0x9/0x20 [rdma_rxe]
[ 2276.222467] Code: 00 0f 1f 44 00 00 48 8b 47 48 48 8b 00 e9 ff 29 9d d4 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 48 8b 47 48 <48> 8b 40 08 e9 de 29 9d d4 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f
[ 2276.225940] RSP: 0018:ffffaadbc065fcf0 EFLAGS: 00010282
[ 2276.226924] RAX: 0000000000000000 RBX: 00007fff83c97500 RCX: 0000000000000003
[ 2276.228247] RDX: ffffaadbc065fd48 RSI: 0000000000000000 RDI: ffff99a4f0b7c3e0
[ 2276.229578] RBP: 0000000000000000 R08: 0000000000000402 R09: ffff99a4d5aa0400
[ 2276.230901] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 2276.232223] R13: ffffaadbc065fdd8 R14: 0000000000000003 R15: ffff99a4d94c4c00
[ 2276.233556] FS:  00007f65a2a68740(0000) GS:ffff99a6f5c80000(0000) knlGS:0000000000000000
[ 2276.235056] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2276.236133] CR2: 0000000000000008 CR3: 000000012807a002 CR4: 0000000000060ee0
[ 2276.237468] Call Trace:
[ 2276.237965]  <TASK>
[ 2276.238385]  rxe_post_send+0x2d/0x50 [rdma_rxe]
[ 2276.239260]  ib_uverbs_post_send+0x5f8/0x680 [ib_uverbs]
[ 2276.240363]  ? __mod_memcg_lruvec_state+0x41/0x90
[ 2276.241311]  ib_uverbs_write+0x3c8/0x500 [ib_uverbs]
[ 2276.242265]  vfs_write+0xc5/0x3b0
[ 2276.242964]  ? handle_mm_fault+0xc5/0x2b0
[ 2276.243758]  ksys_write+0xab/0xe0
[ 2276.244398]  ? syscall_trace_enter.constprop.0+0x126/0x1a0
[ 2276.245477]  do_syscall_64+0x3b/0x90
[ 2276.246198]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 2276.247243] RIP: 0033:0x7f65a233e967
[ 2276.247931] Code: 0b 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
[ 2276.251370] RSP: 002b:00007fff83c974e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 2276.252785] RAX: ffffffffffffffda RBX: 000055ca9f506410 RCX: 00007f65a233e967
[ 2276.254108] RDX: 0000000000000020 RSI: 00007fff83c97500 RDI: 0000000000000003
[ 2276.255431] RBP: 0000000000000000 R08: 0000000000000000 R09: 00007f65a293cd80
[ 2276.256762] R10: 00007f65a1edada0 R11: 0000000000000246 R12: 0000000000000008
[ 2276.258086] R13: 00007f65a2b85180 R14: 0000000000000000 R15: 0000000000000003
[ 2276.259413]  </TASK>
[ 2276.259851] Modules linked in: rpcrdma rdma_ucm ib_srpt ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm rdma_rxe ib_uverbs ip6_udp_tunnel udp_tunnel ib_core rfkill sunrpc intel_rapl_msr intel_rapl_common kvm_intel kvm irqbypass nd_pmem nd_btt dax_pmem joydev virtio_balloon i2c_piix4 pcspkr drm xfs libcrc32c sd_mod t10_pi sr_mod crc64_rocksoft_generic cdrom crc64_rocksoft crc64 sg ata_generic crct10dif_pclmul ata_piix crc32_pclmul nd_e820 crc32c_intel serio_raw libnvdimm ghash_clmulni_intel libata virtio_net e1000 sha512_ssse3 net_failover virtio_console failover dm_mirror dm_region_hash dm_log dm_mod fuse
[ 2276.270673] CR2: 0000000000000008
=====

The callers of rxe_init_task() do not get the returned value, and they incorrectly
report success to the upper layer. You need to modify rxe_qp_init_resp() and
rxe_qp_init_req() to handle the error properly. Additionally, I think it is better
to use pr_warn() here instead of pr_debug() to let users notice the failure.

Thanks,
Daisuke

> +	}
> +
> +	return 0;
> +}
> +
> +void rxe_sched_task(struct rxe_task *task)
> +{
> +	task->ops->sched(task);
> +}
> +
> +void rxe_run_task(struct rxe_task *task)
> +{
> +	task->ops->run(task);
> +}
> +
> +void rxe_enable_task(struct rxe_task *task)
> +{
> +	task->ops->enable(task);
> +}
> +
> +void rxe_disable_task(struct rxe_task *task)
> +{
> +	task->ops->disable(task);
> +}
> +
> +void rxe_cleanup_task(struct rxe_task *task)
> +{
> +	task->ops->cleanup(task);
>  }
> diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
> index 7b88129702ac..31963129ff7a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_task.h
> +++ b/drivers/infiniband/sw/rxe/rxe_task.h
> @@ -7,6 +7,21 @@
>  #ifndef RXE_TASK_H
>  #define RXE_TASK_H
> 
> +struct rxe_task;
> +
> +struct rxe_task_ops {
> +	void (*sched)(struct rxe_task *task);
> +	void (*run)(struct rxe_task *task);
> +	void (*disable)(struct rxe_task *task);
> +	void (*enable)(struct rxe_task *task);
> +	void (*cleanup)(struct rxe_task *task);
> +};
> +
> +enum rxe_task_type {
> +	RXE_TASK_TYPE_INLINE	= 0,
> +	RXE_TASK_TYPE_TASKLET	= 1,
> +};
> +
>  enum {
>  	TASK_STATE_START	= 0,
>  	TASK_STATE_BUSY		= 1,
> @@ -19,24 +34,19 @@ enum {
>   * called again.
>   */
>  struct rxe_task {
> -	struct tasklet_struct	tasklet;
> -	int			state;
> -	spinlock_t		lock;
> -	void			*arg;
> -	int			(*func)(void *arg);
> -	int			ret;
> -	bool			destroyed;
> +	struct tasklet_struct		tasklet;
> +	int				state;
> +	spinlock_t			lock;
> +	void				*arg;
> +	int				(*func)(void *arg);
> +	int				ret;
> +	bool				destroyed;
> +	const struct rxe_task_ops	*ops;
> +	enum rxe_task_type		type;
>  };
> 
> -/*
> - * init rxe_task structure
> - *	arg  => parameter to pass to fcn
> - *	func => function to call until it returns != 0
> - */
> -int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *));
> -
> -/* cleanup task */
> -void rxe_cleanup_task(struct rxe_task *task);
> +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *),
> +		  enum rxe_task_type type);
> 
>  /*
>   * raw call to func in loop without any checking
> @@ -54,4 +64,6 @@ void rxe_disable_task(struct rxe_task *task);
>  /* allow task to run */
>  void rxe_enable_task(struct rxe_task *task);
> 
> +void rxe_cleanup_task(struct rxe_task *task);
> +
>  #endif /* RXE_TASK_H */
> --
> 2.34.1
Bob Pearson Oct. 25, 2022, 2:16 p.m. UTC | #2
On 10/25/22 03:02, Daisuke Matsuda (Fujitsu) wrote:
> On Sat, Oct 22, 2022 5:01 AM Bob Pearson wrote:
>>
>> Make the internal interface to the task operations pluggable and
>> add a new 'inline' type.
> 
> I do not see why we need the new 'inline' type. It may be technically
> possible to add it, but is there any situation where it is useful?
> With the new mode, It will take longer for softirq (NET_RX_SOFTIRQ)
> to complete its work, and that can cause a negative effect on the system
> as a whole.
> 
The one and only place where it makes sense in production is for the completer
task for UD QPs. In the original rxe code it was called inline from the requester.
Later that sort of got blurred and sometimes it was called inline and sometimes as
a tasklet. For benchmarking and testing it is useful to have other options to
check. In my world, HPC, storage use cases are typically all consuming when active
and we will try anything to see what gives the best performance and not worry too
much about the effect on other jobs. I am also interested in trying threaded interrupts
which are described as another alternative to tasklets.

Bob
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 3f6d62a80bea..b5e108794aa1 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -238,8 +238,10 @@  static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
 
 	skb_queue_head_init(&qp->req_pkts);
 
-	rxe_init_task(&qp->req.task, qp, rxe_requester);
-	rxe_init_task(&qp->comp.task, qp, rxe_completer);
+	rxe_init_task(&qp->req.task, qp, rxe_requester, RXE_TASK_TYPE_TASKLET);
+	rxe_init_task(&qp->comp.task, qp, rxe_completer,
+			(qp_type(qp) == IB_QPT_RC) ? RXE_TASK_TYPE_TASKLET :
+						     RXE_TASK_TYPE_INLINE);
 
 	qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
 	if (init->qp_type == IB_QPT_RC) {
@@ -286,7 +288,7 @@  static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
 
 	skb_queue_head_init(&qp->resp_pkts);
 
-	rxe_init_task(&qp->resp.task, qp, rxe_responder);
+	rxe_init_task(&qp->resp.task, qp, rxe_responder, RXE_TASK_TYPE_TASKLET);
 
 	qp->resp.opcode		= OPCODE_NONE;
 	qp->resp.msn		= 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
index 0208d833a41b..8dfbfa164eff 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.c
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -24,12 +24,11 @@  int __rxe_do_task(struct rxe_task *task)
  * a second caller finds the task already running
  * but looks just after the last call to func
  */
-static void do_task(struct tasklet_struct *t)
+static void do_task(struct rxe_task *task)
 {
+	unsigned int iterations = RXE_MAX_ITERATIONS;
 	int cont;
 	int ret;
-	struct rxe_task *task = from_tasklet(task, t, tasklet);
-	unsigned int iterations = RXE_MAX_ITERATIONS;
 
 	spin_lock_bh(&task->lock);
 	switch (task->state) {
@@ -90,28 +89,21 @@  static void do_task(struct tasklet_struct *t)
 	task->ret = ret;
 }
 
-int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *))
+static void disable_task(struct rxe_task *task)
 {
-	task->arg	= arg;
-	task->func	= func;
-	task->destroyed	= false;
-
-	tasklet_setup(&task->tasklet, do_task);
-
-	task->state = TASK_STATE_START;
-	spin_lock_init(&task->lock);
+	/* todo */
+}
 
-	return 0;
+static void enable_task(struct rxe_task *task)
+{
+	/* todo */
 }
 
-void rxe_cleanup_task(struct rxe_task *task)
+/* busy wait until any previous tasks are done */
+static void cleanup_task(struct rxe_task *task)
 {
 	bool idle;
 
-	/*
-	 * Mark the task, then wait for it to finish. It might be
-	 * running in a non-tasklet (direct call) context.
-	 */
 	task->destroyed = true;
 
 	do {
@@ -119,32 +111,144 @@  void rxe_cleanup_task(struct rxe_task *task)
 		idle = (task->state == TASK_STATE_START);
 		spin_unlock_bh(&task->lock);
 	} while (!idle);
+}
 
-	tasklet_kill(&task->tasklet);
+/* silently treat schedule as inline for inline tasks */
+static void inline_sched(struct rxe_task *task)
+{
+	do_task(task);
 }
 
-void rxe_run_task(struct rxe_task *task)
+static void inline_run(struct rxe_task *task)
 {
-	if (task->destroyed)
-		return;
+	do_task(task);
+}
 
-	do_task(&task->tasklet);
+static void inline_disable(struct rxe_task *task)
+{
+	disable_task(task);
 }
 
-void rxe_sched_task(struct rxe_task *task)
+static void inline_enable(struct rxe_task *task)
 {
-	if (task->destroyed)
-		return;
+	enable_task(task);
+}
+
+static void inline_cleanup(struct rxe_task *task)
+{
+	cleanup_task(task);
+}
+
+static const struct rxe_task_ops inline_ops = {
+	.sched = inline_sched,
+	.run = inline_run,
+	.enable = inline_enable,
+	.disable = inline_disable,
+	.cleanup = inline_cleanup,
+};
 
+static void inline_init(struct rxe_task *task)
+{
+	task->ops = &inline_ops;
+}
+
+/* use tsklet_xxx to avoid name collisions with tasklet_xxx */
+static void tsklet_sched(struct rxe_task *task)
+{
 	tasklet_schedule(&task->tasklet);
 }
 
-void rxe_disable_task(struct rxe_task *task)
+static void tsklet_do_task(struct tasklet_struct *tasklet)
 {
+	struct rxe_task *task = container_of(tasklet, typeof(*task), tasklet);
+
+	do_task(task);
+}
+
+static void tsklet_run(struct rxe_task *task)
+{
+	do_task(task);
+}
+
+static void tsklet_disable(struct rxe_task *task)
+{
+	disable_task(task);
 	tasklet_disable(&task->tasklet);
 }
 
-void rxe_enable_task(struct rxe_task *task)
+static void tsklet_enable(struct rxe_task *task)
 {
 	tasklet_enable(&task->tasklet);
+	enable_task(task);
+}
+
+static void tsklet_cleanup(struct rxe_task *task)
+{
+	cleanup_task(task);
+	tasklet_kill(&task->tasklet);
+}
+
+static const struct rxe_task_ops tsklet_ops = {
+	.sched = tsklet_sched,
+	.run = tsklet_run,
+	.enable = tsklet_enable,
+	.disable = tsklet_disable,
+	.cleanup = tsklet_cleanup,
+};
+
+static void tsklet_init(struct rxe_task *task)
+{
+	tasklet_setup(&task->tasklet, tsklet_do_task);
+	task->ops = &tsklet_ops;
+}
+
+int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *),
+		  enum rxe_task_type type)
+{
+	task->arg	= arg;
+	task->func	= func;
+	task->destroyed	= false;
+	task->type	= type;
+	task->state	= TASK_STATE_START;
+
+	spin_lock_init(&task->lock);
+
+	switch (type) {
+	case RXE_TASK_TYPE_INLINE:
+		inline_init(task);
+		break;
+	case RXE_TASK_TYPE_TASKLET:
+		tsklet_init(task);
+		break;
+	default:
+		pr_debug("%s: invalid task type = %d\n", __func__, type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void rxe_sched_task(struct rxe_task *task)
+{
+	task->ops->sched(task);
+}
+
+void rxe_run_task(struct rxe_task *task)
+{
+	task->ops->run(task);
+}
+
+void rxe_enable_task(struct rxe_task *task)
+{
+	task->ops->enable(task);
+}
+
+void rxe_disable_task(struct rxe_task *task)
+{
+	task->ops->disable(task);
+}
+
+void rxe_cleanup_task(struct rxe_task *task)
+{
+	task->ops->cleanup(task);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
index 7b88129702ac..31963129ff7a 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.h
+++ b/drivers/infiniband/sw/rxe/rxe_task.h
@@ -7,6 +7,21 @@ 
 #ifndef RXE_TASK_H
 #define RXE_TASK_H
 
+struct rxe_task;
+
+struct rxe_task_ops {
+	void (*sched)(struct rxe_task *task);
+	void (*run)(struct rxe_task *task);
+	void (*disable)(struct rxe_task *task);
+	void (*enable)(struct rxe_task *task);
+	void (*cleanup)(struct rxe_task *task);
+};
+
+enum rxe_task_type {
+	RXE_TASK_TYPE_INLINE	= 0,
+	RXE_TASK_TYPE_TASKLET	= 1,
+};
+
 enum {
 	TASK_STATE_START	= 0,
 	TASK_STATE_BUSY		= 1,
@@ -19,24 +34,19 @@  enum {
  * called again.
  */
 struct rxe_task {
-	struct tasklet_struct	tasklet;
-	int			state;
-	spinlock_t		lock;
-	void			*arg;
-	int			(*func)(void *arg);
-	int			ret;
-	bool			destroyed;
+	struct tasklet_struct		tasklet;
+	int				state;
+	spinlock_t			lock;
+	void				*arg;
+	int				(*func)(void *arg);
+	int				ret;
+	bool				destroyed;
+	const struct rxe_task_ops	*ops;
+	enum rxe_task_type		type;
 };
 
-/*
- * init rxe_task structure
- *	arg  => parameter to pass to fcn
- *	func => function to call until it returns != 0
- */
-int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *));
-
-/* cleanup task */
-void rxe_cleanup_task(struct rxe_task *task);
+int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *),
+		  enum rxe_task_type type);
 
 /*
  * raw call to func in loop without any checking
@@ -54,4 +64,6 @@  void rxe_disable_task(struct rxe_task *task);
 /* allow task to run */
 void rxe_enable_task(struct rxe_task *task);
 
+void rxe_cleanup_task(struct rxe_task *task);
+
 #endif /* RXE_TASK_H */