diff mbox

[v2,2/3] libiscsi, iser: Adjust data_length to include protection information

Message ID 1402477799-24610-3-git-send-email-sagig@mellanox.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Sagi Grimberg June 11, 2014, 9:09 a.m. UTC
In case protection information exists over the wire
iscsi header data length is required to include it.
Use protection information aware scsi helpers to set
the correct transfer length.

In order to avoid breakage, remove iser transfer length
checks for each task as they are not always true and
somewhat redundant anyway.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Mike Christie <michaelc@cs.wisc.edu>
---
 drivers/infiniband/ulp/iser/iser_initiator.c |   34 +++++++------------------
 drivers/scsi/libiscsi.c                      |   18 +++++++-------
 2 files changed, 19 insertions(+), 33 deletions(-)

Comments

Christoph Hellwig June 23, 2014, 8:59 p.m. UTC | #1
This patch causes a regression when using the iscsi initiator over
TCP for me. When mounting a newly created ext4 filesystem I get the
following BUG: 

[   31.611803] BUG: unable to handle kernel NULL pointer dereference at 000000000000000c
[   31.613563] IP: [<ffffffff8197b38d>] iscsi_tcp_segment_done+0x2bd/0x380
[   31.613563] PGD 7a3e4067 PUD 7a45f067 PMD 0 
[   31.613563] Oops: 0000 [#1] SMP 
[   31.613563] Modules linked in:
[   31.613563] CPU: 3 PID: 3739 Comm: kworker/u8:5 Not tainted 3.16.0-rc2 #187
[   31.613563] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[   31.613563] Workqueue: iscsi_q_2 iscsi_xmitworker
[   31.613563] task: ffff88007b33cf10 ti: ffff88007ad94000 task.ti: ffff88007ad94000
[   31.613563] RIP: 0010:[<ffffffff8197b38d>]  [<ffffffff8197b38d>] iscsi_tcp_segment_done+0x2bd/0x380
[   31.613563] RSP: 0018:ffff88007ad97b38  EFLAGS: 00010246
[   31.613563] RAX: 0000000000000000 RBX: ffff88007cd67910 RCX: 0000000000000200
[   31.613563] RDX: 0000000000002000 RSI: 0000000000000000 RDI: ffff88007cd67910
[   31.613563] RBP: ffff88007ad97b98 R08: 0000000000000200 R09: 0000000000000000
[   31.613563] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[   31.613563] R13: ffff88007cd67780 R14: 0000000000000000 R15: 0000000000000000
[   31.613563] FS:  0000000000000000(0000) GS:ffff88007fd80000(0000) knlGS:0000000000000000
[   31.613563] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   31.613563] CR2: 000000000000000c CR3: 000000007afd9000 CR4: 00000000000006e0
[   31.613563] Stack:
[   31.613563]  ffff88007ad97b98 ffffffff81c68fd6 ffffffff81c68f20 ffff88007c8e37c8
[   31.613563]  000000007b33d728 ffff88007dc805b0 ffff88007ad97c58 0000000000000200
[   31.613563]  ffff88007cd67780 ffff880000c00040 ffff88007ad97c00 ffff88007cd67910
[   31.613563] Call Trace:
[   31.613563]  [<ffffffff81c68fd6>] ? inet_sendpage+0xb6/0x130
[   31.613563]  [<ffffffff81c68f20>] ? inet_dgram_connect+0x80/0x80
[   31.613563]  [<ffffffff8197bd95>] iscsi_sw_tcp_pdu_xmit+0xe5/0x2e0
[   31.613563]  [<ffffffff8197badf>] ? iscsi_sw_tcp_pdu_init+0x1bf/0x390
[   31.613563]  [<ffffffff81979b82>] iscsi_tcp_task_xmit+0xa2/0x2b0
[   31.613563]  [<ffffffff81974815>] ? iscsi_xmit_task+0x45/0xd0
[   31.613563]  [<ffffffff810fbb8d>] ? trace_hardirqs_on+0xd/0x10
[   31.613563]  [<ffffffff810b54a0>] ? __local_bh_enable_ip+0x70/0xd0
[   31.613563]  [<ffffffff81974829>] iscsi_xmit_task+0x59/0xd0
[   31.613563]  [<ffffffff81978468>] iscsi_xmitworker+0x288/0x330
[   31.613563]  [<ffffffff810cc847>] process_one_work+0x1c7/0x490
[   31.613563]  [<ffffffff810cc7dd>] ? process_one_work+0x15d/0x490
[   31.613563]  [<ffffffff810cd539>] worker_thread+0x119/0x4f0
[   31.613563]  [<ffffffff810fbb8d>] ? trace_hardirqs_on+0xd/0x10
[   31.613563]  [<ffffffff810cd420>] ? init_pwq+0x190/0x190
[   31.613563]  [<ffffffff810d3c3f>] kthread+0xdf/0x100
[   31.613563]  [<ffffffff810d3b60>] ? __init_kthread_worker+0x70/0x70
[   31.613563]  [<ffffffff81d904bc>] ret_from_fork+0x7c/0xb0
[   31.613563]  [<ffffffff810d3b60>] ? __init_kthread_worker+0x70/0x70
[   31.613563] Code: 89 03 31 c0 e9 cc fe ff ff 0f 1f 44 00 00 48 8b 7b
30 e8 17 74 de ff 8b 53 10 c7 43 40 00 00 00 00 48 89 43 30 44 89 f6 48
89 df <8b> 40 0c 48 c7 03 00 00 00 00 2b 53 14 39 c2 0f 47 d0 89 53 08 


(gdb) l *(iscsi_tcp_segment_done+0x2bd)
0xffffffff8197b38d is in iscsi_tcp_segment_done
(../drivers/scsi/libiscsi_tcp.c:102).
97	iscsi_tcp_segment_init_sg(struct iscsi_segment *segment,
98				  struct scatterlist *sg, unsigned int offset)
99	{
100		segment->sg = sg;
101		segment->sg_offset = offset;
102		segment->size = min(sg->length - offset,
103				    segment->total_size - segment->total_copied);
104		segment->data = NULL;
105	}
106	

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Christie June 24, 2014, 6:31 a.m. UTC | #2
On 06/23/2014 03:59 PM, Christoph Hellwig wrote:
> This patch causes a regression when using the iscsi initiator over
> TCP for me. When mounting a newly created ext4 filesystem I get the
> following BUG: 
> 
> [   31.611803] BUG: unable to handle kernel NULL pointer dereference at 000000000000000c
> [   31.613563] IP: [<ffffffff8197b38d>] iscsi_tcp_segment_done+0x2bd/0x380
> [   31.613563] PGD 7a3e4067 PUD 7a45f067 PMD 0 
> [   31.613563] Oops: 0000 [#1] SMP 
> [   31.613563] Modules linked in:
> [   31.613563] CPU: 3 PID: 3739 Comm: kworker/u8:5 Not tainted 3.16.0-rc2 #187
> [   31.613563] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> [   31.613563] Workqueue: iscsi_q_2 iscsi_xmitworker
> [   31.613563] task: ffff88007b33cf10 ti: ffff88007ad94000 task.ti: ffff88007ad94000
> [   31.613563] RIP: 0010:[<ffffffff8197b38d>]  [<ffffffff8197b38d>] iscsi_tcp_segment_done+0x2bd/0x380
> [   31.613563] RSP: 0018:ffff88007ad97b38  EFLAGS: 00010246
> [   31.613563] RAX: 0000000000000000 RBX: ffff88007cd67910 RCX: 0000000000000200
> [   31.613563] RDX: 0000000000002000 RSI: 0000000000000000 RDI: ffff88007cd67910
> [   31.613563] RBP: ffff88007ad97b98 R08: 0000000000000200 R09: 0000000000000000
> [   31.613563] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
> [   31.613563] R13: ffff88007cd67780 R14: 0000000000000000 R15: 0000000000000000
> [   31.613563] FS:  0000000000000000(0000) GS:ffff88007fd80000(0000) knlGS:0000000000000000
> [   31.613563] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [   31.613563] CR2: 000000000000000c CR3: 000000007afd9000 CR4: 00000000000006e0
> [   31.613563] Stack:
> [   31.613563]  ffff88007ad97b98 ffffffff81c68fd6 ffffffff81c68f20 ffff88007c8e37c8
> [   31.613563]  000000007b33d728 ffff88007dc805b0 ffff88007ad97c58 0000000000000200
> [   31.613563]  ffff88007cd67780 ffff880000c00040 ffff88007ad97c00 ffff88007cd67910
> [   31.613563] Call Trace:
> [   31.613563]  [<ffffffff81c68fd6>] ? inet_sendpage+0xb6/0x130
> [   31.613563]  [<ffffffff81c68f20>] ? inet_dgram_connect+0x80/0x80
> [   31.613563]  [<ffffffff8197bd95>] iscsi_sw_tcp_pdu_xmit+0xe5/0x2e0
> [   31.613563]  [<ffffffff8197badf>] ? iscsi_sw_tcp_pdu_init+0x1bf/0x390
> [   31.613563]  [<ffffffff81979b82>] iscsi_tcp_task_xmit+0xa2/0x2b0
> [   31.613563]  [<ffffffff81974815>] ? iscsi_xmit_task+0x45/0xd0
> [   31.613563]  [<ffffffff810fbb8d>] ? trace_hardirqs_on+0xd/0x10
> [   31.613563]  [<ffffffff810b54a0>] ? __local_bh_enable_ip+0x70/0xd0
> [   31.613563]  [<ffffffff81974829>] iscsi_xmit_task+0x59/0xd0
> [   31.613563]  [<ffffffff81978468>] iscsi_xmitworker+0x288/0x330
> [   31.613563]  [<ffffffff810cc847>] process_one_work+0x1c7/0x490
> [   31.613563]  [<ffffffff810cc7dd>] ? process_one_work+0x15d/0x490
> [   31.613563]  [<ffffffff810cd539>] worker_thread+0x119/0x4f0
> [   31.613563]  [<ffffffff810fbb8d>] ? trace_hardirqs_on+0xd/0x10
> [   31.613563]  [<ffffffff810cd420>] ? init_pwq+0x190/0x190
> [   31.613563]  [<ffffffff810d3c3f>] kthread+0xdf/0x100
> [   31.613563]  [<ffffffff810d3b60>] ? __init_kthread_worker+0x70/0x70
> [   31.613563]  [<ffffffff81d904bc>] ret_from_fork+0x7c/0xb0
> [   31.613563]  [<ffffffff810d3b60>] ? __init_kthread_worker+0x70/0x70
> [   31.613563] Code: 89 03 31 c0 e9 cc fe ff ff 0f 1f 44 00 00 48 8b 7b
> 30 e8 17 74 de ff 8b 53 10 c7 43 40 00 00 00 00 48 89 43 30 44 89 f6 48
> 89 df <8b> 40 0c 48 c7 03 00 00 00 00 2b 53 14 39 c2 0f 47 d0 89 53 08 
> 
> 
> (gdb) l *(iscsi_tcp_segment_done+0x2bd)
> 0xffffffff8197b38d is in iscsi_tcp_segment_done
> (../drivers/scsi/libiscsi_tcp.c:102).
> 97	iscsi_tcp_segment_init_sg(struct iscsi_segment *segment,
> 98				  struct scatterlist *sg, unsigned int offset)
> 99	{
> 100		segment->sg = sg;
> 101		segment->sg_offset = offset;
> 102		segment->size = min(sg->length - offset,
> 103				    segment->total_size - segment->total_copied);
> 104		segment->data = NULL;
> 105	}
> 106	
> 


Ok, it looks like scsi_out(scsi_cmnd)->length (iscsi_tcp/libiscsi_tcp
still uses that for lower level operations since it was not converted to
support t10 pi) returns a different value than scsi_transfer_length()
(libiscsi uses this for higher level operations when it was converted to
t10 support since iser uses that module and also has t10 support) for
some commands. We then end up incorrectly thinking some requests are the
wrong size and then hit this. Looking into why exactly this happens.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boaz Harrosh July 27, 2014, 10:08 a.m. UTC | #3
On 06/11/2014 12:09 PM, Sagi Grimberg wrote:
> In case protection information exists over the wire
> iscsi header data length is required to include it.
> Use protection information aware scsi helpers to set
> the correct transfer length.
> 
> In order to avoid breakage, remove iser transfer length
> checks for each task as they are not always true and
> somewhat redundant anyway.
> 
> Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
> Reviewed-by: Mike Christie <michaelc@cs.wisc.edu>
> ---
>  drivers/infiniband/ulp/iser/iser_initiator.c |   34 +++++++------------------
>  drivers/scsi/libiscsi.c                      |   18 +++++++-------
>  2 files changed, 19 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
> index 2e2d903..8d44a40 100644
> --- a/drivers/infiniband/ulp/iser/iser_initiator.c
> +++ b/drivers/infiniband/ulp/iser/iser_initiator.c
> @@ -41,11 +41,11 @@
>  #include "iscsi_iser.h"
>  
>  /* Register user buffer memory and initialize passive rdma
> - *  dto descriptor. Total data size is stored in
> - *  iser_task->data[ISER_DIR_IN].data_len
> + *  dto descriptor. Data size is stored in
> + *  task->data[ISER_DIR_IN].data_len, Protection size
> + *  os stored in task->prot[ISER_DIR_IN].data_len
>   */
> -static int iser_prepare_read_cmd(struct iscsi_task *task,
> -				 unsigned int edtl)
> +static int iser_prepare_read_cmd(struct iscsi_task *task)
>  
>  {
>  	struct iscsi_iser_task *iser_task = task->dd_data;
> @@ -73,14 +73,6 @@ static int iser_prepare_read_cmd(struct iscsi_task *task,
>  			return err;
>  	}
>  
> -	if (edtl > iser_task->data[ISER_DIR_IN].data_len) {
> -		iser_err("Total data length: %ld, less than EDTL: "
> -			 "%d, in READ cmd BHS itt: %d, conn: 0x%p\n",
> -			 iser_task->data[ISER_DIR_IN].data_len, edtl,
> -			 task->itt, iser_task->ib_conn);
> -		return -EINVAL;
> -	}
> -
>  	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
>  	if (err) {
>  		iser_err("Failed to set up Data-IN RDMA\n");
> @@ -100,8 +92,9 @@ static int iser_prepare_read_cmd(struct iscsi_task *task,
>  }
>  
>  /* Register user buffer memory and initialize passive rdma
> - *  dto descriptor. Total data size is stored in
> - *  task->data[ISER_DIR_OUT].data_len
> + *  dto descriptor. Data size is stored in
> + *  task->data[ISER_DIR_OUT].data_len, Protection size
> + *  is stored at task->prot[ISER_DIR_OUT].data_len
>   */
>  static int
>  iser_prepare_write_cmd(struct iscsi_task *task,
> @@ -135,14 +128,6 @@ iser_prepare_write_cmd(struct iscsi_task *task,
>  			return err;
>  	}
>  
> -	if (edtl > iser_task->data[ISER_DIR_OUT].data_len) {
> -		iser_err("Total data length: %ld, less than EDTL: %d, "
> -			 "in WRITE cmd BHS itt: %d, conn: 0x%p\n",
> -			 iser_task->data[ISER_DIR_OUT].data_len,
> -			 edtl, task->itt, task->conn);
> -		return -EINVAL;
> -	}
> -
>  	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
>  	if (err != 0) {
>  		iser_err("Failed to register write cmd RDMA mem\n");
> @@ -417,11 +402,12 @@ int iser_send_command(struct iscsi_conn *conn,
>  	if (scsi_prot_sg_count(sc)) {
>  		prot_buf->buf  = scsi_prot_sglist(sc);
>  		prot_buf->size = scsi_prot_sg_count(sc);
> -		prot_buf->data_len = sc->prot_sdb->length;
> +		prot_buf->data_len = data_buf->data_len >>
> +				     ilog2(sc->device->sector_size) * 8;
>  	}
>  
>  	if (hdr->flags & ISCSI_FLAG_CMD_READ) {
> -		err = iser_prepare_read_cmd(task, edtl);
> +		err = iser_prepare_read_cmd(task);
>  		if (err)
>  			goto send_command_error;
>  	}
> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
> index 26dc005..3f46234 100644
> --- a/drivers/scsi/libiscsi.c
> +++ b/drivers/scsi/libiscsi.c
> @@ -338,7 +338,7 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>  	struct iscsi_session *session = conn->session;
>  	struct scsi_cmnd *sc = task->sc;
>  	struct iscsi_scsi_req *hdr;
> -	unsigned hdrlength, cmd_len;
> +	unsigned hdrlength, cmd_len, transfer_length;

I hate that you introduced this new transfer_length variable. It does
not exist. In BIDI supporting driver there is out_len and in_len just
as original code.

>  	itt_t itt;
>  	int rc;
>  
> @@ -391,11 +391,11 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>  	if (scsi_get_prot_op(sc) != SCSI_PROT_NORMAL)
>  		task->protected = true;
>  
> +	transfer_length = scsi_transfer_length(sc);
> +	hdr->data_length = cpu_to_be32(transfer_length);
>  	if (sc->sc_data_direction == DMA_TO_DEVICE) {
> -		unsigned out_len = scsi_out(sc)->length;

In light of all the comments and the obvious bugs, please just
re do this patch. Do not just later fix this one.

All you need is:
+		unsigned out_len = scsi_transfer_length(sc ,scsi_out(sc));

Also I would love if you left the addition to the user .I.E

		out_len += scsi_proto_length(sc ,scsi_out(sc));

This way we can see that this addition is because of scsi_proto and that
this particular driver puts them together in the same payload. There can be
other DIFF supporting drivers that put the DIFF payload on another stream / another
SG list, like the sata one, right?

Then  scsi_transfer_length() becomes:
static inline unsigned scsi_proto_length(struct scsi_cmnd *scmd, scsi_data_buffer *sdb)
{
	unsigned int prot_op = scsi_get_prot_op(scmd);
	unsigned int sector_size = scmd->device->sector_size;

	switch (prot_op) {
	case SCSI_PROT_NORMAL:
	case SCSI_PROT_WRITE_STRIP:
	case SCSI_PROT_READ_INSERT:
		return 0;
	}

	return (sdb->length >> ilog2(sector_size)) * 8;
}


>  		struct iscsi_r2t_info *r2t = &task->unsol_r2t;
>  
> -		hdr->data_length = cpu_to_be32(out_len);

And this stays as is

>  		hdr->flags |= ISCSI_FLAG_CMD_WRITE;
>  		/*
>  		 * Write counters:
> @@ -414,18 +414,19 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>  		memset(r2t, 0, sizeof(*r2t));
>  
>  		if (session->imm_data_en) {
> -			if (out_len >= session->first_burst)
> +			if (transfer_length >= session->first_burst)

And this stays as is

>  				task->imm_count = min(session->first_burst,
>  							conn->max_xmit_dlength);
>  			else
> -				task->imm_count = min(out_len,
> -							conn->max_xmit_dlength);
> +				task->imm_count = min(transfer_length,
> +						      conn->max_xmit_dlength);

And this stays as is

>  			hton24(hdr->dlength, task->imm_count);
>  		} else
>  			zero_data(hdr->dlength);
>  
>  		if (!session->initial_r2t_en) {
> -			r2t->data_length = min(session->first_burst, out_len) -
> +			r2t->data_length = min(session->first_burst,
> +					       transfer_length) -

And this stays as is

>  					       task->imm_count;
>  			r2t->data_offset = task->imm_count;
>  			r2t->ttt = cpu_to_be32(ISCSI_RESERVED_TAG);
> @@ -438,7 +439,6 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>  	} else {
>  		hdr->flags |= ISCSI_FLAG_CMD_FINAL;
>  		zero_data(hdr->dlength);
> -		hdr->data_length = cpu_to_be32(scsi_in(sc)->length);

And this stays as is

BTW: When reading DIFF devices, don't you have extra bits to read as well?
     How does the DIFF read works? Please get me up to speed. I'm not familiar
     with this protocol? 
     (I'd imagine that if say an app reads X bytes with DIFF info, it wants to
      receive its DIFF checksome for every sector in X, where is this info
      on the iscsi wire?)

>  
>  		if (sc->sc_data_direction == DMA_FROM_DEVICE)
>  			hdr->flags |= ISCSI_FLAG_CMD_READ;
> @@ -466,7 +466,7 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>  			  scsi_bidi_cmnd(sc) ? "bidirectional" :
>  			  sc->sc_data_direction == DMA_TO_DEVICE ?
>  			  "write" : "read", conn->id, sc, sc->cmnd[0],
> -			  task->itt, scsi_bufflen(sc),
> +			  task->itt, transfer_length,

And this

This print is correct as it covers all cases. If you want to print the adjusted
length then OK, you'd need to store this I guess, but store it as a different
variable like proto_length and print it as an additional variable.
The current print is one-to-one with what the caller requested, FS wrote X bytes.
If any was added for proto I'd like to see both prints.

>  			  scsi_bidi_cmnd(sc) ? scsi_in(sc)->length : 0,
>  			  session->cmdsn,
>  			  session->max_cmdsn - session->exp_cmdsn + 1);
> 


Rrrr

I see that this patch is already in mainline. I'd like to see the fixing
patch reverting all these wrong changes to the code and only leaving
the single needed change above.

I'll send a PATCH over linus/master later today.

Thanks
Boaz

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg Aug. 6, 2014, 12:43 p.m. UTC | #4
Hi Boaz,

On 7/27/2014 1:08 PM, Boaz Harrosh wrote:
<SNIP>
>> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
>> index 26dc005..3f46234 100644
>> --- a/drivers/scsi/libiscsi.c
>> +++ b/drivers/scsi/libiscsi.c
>> @@ -338,7 +338,7 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>>   	struct iscsi_session *session = conn->session;
>>   	struct scsi_cmnd *sc = task->sc;
>>   	struct iscsi_scsi_req *hdr;
>> -	unsigned hdrlength, cmd_len;
>> +	unsigned hdrlength, cmd_len, transfer_length;
>
> I hate that you introduced this new transfer_length variable. It does
> not exist. In BIDI supporting driver there is out_len and in_len just
> as original code.

Effectively, out_len and in_len are the same except for the bidi case.
Moreover, the hdr->data_length is exactly the command scsi data buffer
length, the bidi length is taken from scsi_in when we build the AHS for
bidi (rlen_ahdr->read_length).

So to me it is correct and makes sense. But I'm don't feel so strong
about it...

Mike what's your take on this?

>
>>   	itt_t itt;
>>   	int rc;
>>
>> @@ -391,11 +391,11 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>>   	if (scsi_get_prot_op(sc) != SCSI_PROT_NORMAL)
>>   		task->protected = true;
>>
>> +	transfer_length = scsi_transfer_length(sc);
>> +	hdr->data_length = cpu_to_be32(transfer_length);
>>   	if (sc->sc_data_direction == DMA_TO_DEVICE) {
>> -		unsigned out_len = scsi_out(sc)->length;
>
> In light of all the comments and the obvious bugs, please just
> re do this patch. Do not just later fix this one.
>
> All you need is:
> +		unsigned out_len = scsi_transfer_length(sc ,scsi_out(sc));
>
> Also I would love if you left the addition to the user .I.E
>
> 		out_len += scsi_proto_length(sc ,scsi_out(sc));
>
> This way we can see that this addition is because of scsi_proto and that
> this particular driver puts them together in the same payload. There can be
> other DIFF supporting drivers that put the DIFF payload on another stream / another
> SG list, like the sata one, right?

I think that DIF specification says that on the wire the data and
protection are interleaved i.e. Block1, DIF1, Block2, DIF2...

So I do think that the transfer length should always include
data_length + protection_length.

>
> BTW: When reading DIFF devices, don't you have extra bits to read as well?
>       How does the DIFF read works? Please get me up to speed. I'm not familiar
>       with this protocol?
>       (I'd imagine that if say an app reads X bytes with DIFF info, it wants to
>        receive its DIFF checksome for every sector in X, where is this info
>        on the iscsi wire?)
>

The application wants to read X bytes from a DIF formatted device, the
initiator will prepare buffers for data and for DIF data (in some
implementations it can be the same buffer but not in Linux), and request
reading X+DIF_size bytes (where each block is followed by it's
corresponding integrity field) and place the data blocks in the data
buffer and the integrity fields in the DIF data buffer (usually this
is offloaded).

>>
>>   		if (sc->sc_data_direction == DMA_FROM_DEVICE)
>>   			hdr->flags |= ISCSI_FLAG_CMD_READ;
>> @@ -466,7 +466,7 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
>>   			  scsi_bidi_cmnd(sc) ? "bidirectional" :
>>   			  sc->sc_data_direction == DMA_TO_DEVICE ?
>>   			  "write" : "read", conn->id, sc, sc->cmnd[0],
>> -			  task->itt, scsi_bufflen(sc),
>> +			  task->itt, transfer_length,
>
> And this
>
> This print is correct as it covers all cases. If you want to print the adjusted
> length then OK, you'd need to store this I guess, but store it as a different
> variable like proto_length and print it as an additional variable.

But it is the transfer length that is sent in iSCSI header. Why do you
want to print it as additional info? for transactions that include DIF
the length is the data + protection.

> The current print is one-to-one with what the caller requested, FS wrote X bytes.

It is still one-to-one isn't it?

Sagi.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boaz Harrosh Aug. 6, 2014, 1:25 p.m. UTC | #5
On 08/06/2014 03:43 PM, Sagi Grimberg wrote:
> Hi Boaz,
> 
<>
>>
>> I hate that you introduced this new transfer_length variable. It does
>> not exist. In BIDI supporting driver there is out_len and in_len just
>> as original code.
> 
> Effectively, out_len and in_len are the same except for the bidi case.
> Moreover, the hdr->data_length is exactly the command scsi data buffer
> length, the bidi length is taken from scsi_in when we build the AHS for
> bidi (rlen_ahdr->read_length).
> 
> So to me it is correct and makes sense. But I'm don't feel so strong
> about it...
> 
> Mike what's your take on this?
> 

I have a patch to clean all that, will send tomorrow. 

What I mean is that this is on the out-path only the in path is different.
See the code this variable is only used in the if (== DMA_TO_DEVICE) case and
should be local to that scope. This is my clean up

<>
>> this particular driver puts them together in the same payload. There can be
>> other DIFF supporting drivers that put the DIFF payload on another stream / another
>> SG list, like the sata one, right?
> 
> I think that DIF specification says that on the wire the data and
> protection are interleaved i.e. Block1, DIF1, Block2, DIF2...
> 

No it does not. This is a per transport, and actually per device host
driver. Yes in iSCSI_tcp they are inline in HW cards they might come as
two different SGs (Like the Linux model). Even with iscsi-offload they might
want to be two SG-lists.

> So I do think that the transfer length should always include
> data_length + protection_length.
> 

This is at the iscsi level. But the scsi_transfer_length() is on the scsi
level which keeps them separate. So I think the proper API should be
	scsi_proto_length()

And for LLDs that want them together they can do scsi_bufflen() + scsi_proto_length()
and for other drivers they can do it separately. Don't infect iscsi level assumptions
on the generic layer API.

Again my patch fixes this.

>> And this
>>
>> This print is correct as it covers all cases. If you want to print the adjusted
>> length then OK, you'd need to store this I guess, but store it as a different
>> variable like proto_length and print it as an additional variable.
> 
> But it is the transfer length that is sent in iSCSI header. Why do you
> want to print it as additional info? 

I want to see what was the length the app/FS sent, then as added
info how much was added for DIFF, your way there is lost information.

> for transactions that include DIF
> the length is the data + protection.
> 
> It is still one-to-one isn't it?
> 

No! the original submitted length is lost from the print

> Sagi.
> 

Shalom
Boaz

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin K. Petersen Aug. 6, 2014, 3:54 p.m. UTC | #6
>>>>> "Sagi" == Sagi Grimberg <sagig@dev.mellanox.co.il> writes:

>> BTW: When reading DIFF devices, don't you have extra bits to read as
>> well?  How does the DIFF read works? Please get me up to speed. I'm
>> not familiar with this protocol?  (I'd imagine that if say an app
>> reads X bytes with DIFF info, it wants to receive its DIFF checksome
>> for every sector in X, where is this info on the iscsi wire?)
>> 

Sagi> The application wants to read X bytes from a DIF formatted device,
Sagi> the initiator will prepare buffers for data and for DIF data (in
Sagi> some implementations it can be the same buffer but not in Linux),
Sagi> and request reading X+DIF_size bytes (where each block is followed
Sagi> by it's corresponding integrity field) and place the data blocks
Sagi> in the data buffer and the integrity fields in the DIF data buffer
Sagi> (usually this is offloaded).

READ CAPACITY returns a block size of 512 even though the blocks on disk
are 520. That's the beauty of it.

At the command level all transfers are expressed in number of blocks,
not bytes. The application will see an N x 512 byte buffer but on the
wire between initiator and target we'll transfer N x 520 bytes.
Sagi Grimberg Aug. 13, 2014, 1:09 p.m. UTC | #7
On 8/6/2014 4:25 PM, Boaz Harrosh wrote:
> On 08/06/2014 03:43 PM, Sagi Grimberg wrote:
>> Hi Boaz,
>>
> <>
>>>
>>> I hate that you introduced this new transfer_length variable. It does
>>> not exist. In BIDI supporting driver there is out_len and in_len just
>>> as original code.
>>
>> Effectively, out_len and in_len are the same except for the bidi case.
>> Moreover, the hdr->data_length is exactly the command scsi data buffer
>> length, the bidi length is taken from scsi_in when we build the AHS for
>> bidi (rlen_ahdr->read_length).
>>
>> So to me it is correct and makes sense. But I'm don't feel so strong
>> about it...
>>
>> Mike what's your take on this?
>>
>
> I have a patch to clean all that, will send tomorrow.
>
> What I mean is that this is on the out-path only the in path is different.
> See the code this variable is only used in the if (== DMA_TO_DEVICE) case and
> should be local to that scope. This is my clean up
>

Hey Boaz,

So in the current flow, I still don't think it is wrong/buggy, the
transfer byte length related to scsi buffer length (In iscsi for sure 
but I think that for all transports it is the same).

But,
Since you have such a strong objection on this I'm also OK with changing
stuff... We can pass a buffer to scsi_transfer_length (although it has 
no meaning effectively) and we can keep in/out_len local variables and
print them along with total transfer.

Do you want me to send out a patch here (since I have some hardware to 
test it...) or are you still planning to send out something?

Cheers,
Sagi.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Boaz Harrosh Aug. 14, 2014, 7:17 a.m. UTC | #8
On 08/13/2014 04:09 PM, Sagi Grimberg wrote:
> On 8/6/2014 4:25 PM, Boaz Harrosh wrote:
> 
> Hey Boaz,
> 
> So in the current flow, I still don't think it is wrong/buggy, the
> transfer byte length related to scsi buffer length 
(In iscsi for sure 
> but I think that for all transports it is the same).
> 
> But,
> Since you have such a strong objection on this I'm also OK with changing
> stuff... We can pass a buffer to scsi_transfer_length (although it has 
> no meaning effectively) and we can keep in/out_len local variables and
> print them along with total transfer.
> 
> Do you want me to send out a patch here (since I have some hardware to 
> test it...) or are you still planning to send out something?
> 

I'll do it. As you said there is no bugs, just ugly code. I will send a
cleanup

Thanks
Boaz

> Cheers,
> Sagi.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 2e2d903..8d44a40 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -41,11 +41,11 @@ 
 #include "iscsi_iser.h"
 
 /* Register user buffer memory and initialize passive rdma
- *  dto descriptor. Total data size is stored in
- *  iser_task->data[ISER_DIR_IN].data_len
+ *  dto descriptor. Data size is stored in
+ *  task->data[ISER_DIR_IN].data_len, Protection size
+ *  os stored in task->prot[ISER_DIR_IN].data_len
  */
-static int iser_prepare_read_cmd(struct iscsi_task *task,
-				 unsigned int edtl)
+static int iser_prepare_read_cmd(struct iscsi_task *task)
 
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
@@ -73,14 +73,6 @@  static int iser_prepare_read_cmd(struct iscsi_task *task,
 			return err;
 	}
 
-	if (edtl > iser_task->data[ISER_DIR_IN].data_len) {
-		iser_err("Total data length: %ld, less than EDTL: "
-			 "%d, in READ cmd BHS itt: %d, conn: 0x%p\n",
-			 iser_task->data[ISER_DIR_IN].data_len, edtl,
-			 task->itt, iser_task->ib_conn);
-		return -EINVAL;
-	}
-
 	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
 	if (err) {
 		iser_err("Failed to set up Data-IN RDMA\n");
@@ -100,8 +92,9 @@  static int iser_prepare_read_cmd(struct iscsi_task *task,
 }
 
 /* Register user buffer memory and initialize passive rdma
- *  dto descriptor. Total data size is stored in
- *  task->data[ISER_DIR_OUT].data_len
+ *  dto descriptor. Data size is stored in
+ *  task->data[ISER_DIR_OUT].data_len, Protection size
+ *  is stored at task->prot[ISER_DIR_OUT].data_len
  */
 static int
 iser_prepare_write_cmd(struct iscsi_task *task,
@@ -135,14 +128,6 @@  iser_prepare_write_cmd(struct iscsi_task *task,
 			return err;
 	}
 
-	if (edtl > iser_task->data[ISER_DIR_OUT].data_len) {
-		iser_err("Total data length: %ld, less than EDTL: %d, "
-			 "in WRITE cmd BHS itt: %d, conn: 0x%p\n",
-			 iser_task->data[ISER_DIR_OUT].data_len,
-			 edtl, task->itt, task->conn);
-		return -EINVAL;
-	}
-
 	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
 	if (err != 0) {
 		iser_err("Failed to register write cmd RDMA mem\n");
@@ -417,11 +402,12 @@  int iser_send_command(struct iscsi_conn *conn,
 	if (scsi_prot_sg_count(sc)) {
 		prot_buf->buf  = scsi_prot_sglist(sc);
 		prot_buf->size = scsi_prot_sg_count(sc);
-		prot_buf->data_len = sc->prot_sdb->length;
+		prot_buf->data_len = data_buf->data_len >>
+				     ilog2(sc->device->sector_size) * 8;
 	}
 
 	if (hdr->flags & ISCSI_FLAG_CMD_READ) {
-		err = iser_prepare_read_cmd(task, edtl);
+		err = iser_prepare_read_cmd(task);
 		if (err)
 			goto send_command_error;
 	}
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 26dc005..3f46234 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -338,7 +338,7 @@  static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 	struct iscsi_session *session = conn->session;
 	struct scsi_cmnd *sc = task->sc;
 	struct iscsi_scsi_req *hdr;
-	unsigned hdrlength, cmd_len;
+	unsigned hdrlength, cmd_len, transfer_length;
 	itt_t itt;
 	int rc;
 
@@ -391,11 +391,11 @@  static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 	if (scsi_get_prot_op(sc) != SCSI_PROT_NORMAL)
 		task->protected = true;
 
+	transfer_length = scsi_transfer_length(sc);
+	hdr->data_length = cpu_to_be32(transfer_length);
 	if (sc->sc_data_direction == DMA_TO_DEVICE) {
-		unsigned out_len = scsi_out(sc)->length;
 		struct iscsi_r2t_info *r2t = &task->unsol_r2t;
 
-		hdr->data_length = cpu_to_be32(out_len);
 		hdr->flags |= ISCSI_FLAG_CMD_WRITE;
 		/*
 		 * Write counters:
@@ -414,18 +414,19 @@  static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 		memset(r2t, 0, sizeof(*r2t));
 
 		if (session->imm_data_en) {
-			if (out_len >= session->first_burst)
+			if (transfer_length >= session->first_burst)
 				task->imm_count = min(session->first_burst,
 							conn->max_xmit_dlength);
 			else
-				task->imm_count = min(out_len,
-							conn->max_xmit_dlength);
+				task->imm_count = min(transfer_length,
+						      conn->max_xmit_dlength);
 			hton24(hdr->dlength, task->imm_count);
 		} else
 			zero_data(hdr->dlength);
 
 		if (!session->initial_r2t_en) {
-			r2t->data_length = min(session->first_burst, out_len) -
+			r2t->data_length = min(session->first_burst,
+					       transfer_length) -
 					       task->imm_count;
 			r2t->data_offset = task->imm_count;
 			r2t->ttt = cpu_to_be32(ISCSI_RESERVED_TAG);
@@ -438,7 +439,6 @@  static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 	} else {
 		hdr->flags |= ISCSI_FLAG_CMD_FINAL;
 		zero_data(hdr->dlength);
-		hdr->data_length = cpu_to_be32(scsi_in(sc)->length);
 
 		if (sc->sc_data_direction == DMA_FROM_DEVICE)
 			hdr->flags |= ISCSI_FLAG_CMD_READ;
@@ -466,7 +466,7 @@  static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
 			  scsi_bidi_cmnd(sc) ? "bidirectional" :
 			  sc->sc_data_direction == DMA_TO_DEVICE ?
 			  "write" : "read", conn->id, sc, sc->cmnd[0],
-			  task->itt, scsi_bufflen(sc),
+			  task->itt, transfer_length,
 			  scsi_bidi_cmnd(sc) ? scsi_in(sc)->length : 0,
 			  session->cmdsn,
 			  session->max_cmdsn - session->exp_cmdsn + 1);