From patchwork Mon Oct 12 22:57:23 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Vu Pham X-Patchwork-Id: 53259 Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n9CN8mFr008004 for ; Mon, 12 Oct 2009 23:08:48 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933228AbZJLXD4 (ORCPT ); Mon, 12 Oct 2009 19:03:56 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S933264AbZJLXD4 (ORCPT ); Mon, 12 Oct 2009 19:03:56 -0400 Received: from p02c12o145.mxlogic.net ([208.65.145.78]:45608 "EHLO p02c12o145.mxlogic.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933228AbZJLXD4 (ORCPT ); Mon, 12 Oct 2009 19:03:56 -0400 Received: from unknown [63.251.237.3] (EHLO p02c12o145.mxlogic.net) by p02c12o145.mxlogic.net(mxl_mta-6.4.0-0) with ESMTP id 0e5b3da4.bb4d6b90.74105.00-511.159038.p02c12o145.mxlogic.net (envelope-from ); Mon, 12 Oct 2009 17:04:00 -0600 (MDT) X-MXL-Hash: 4ad3b5e003a1efd9-d8a471b80a3ae8fa9901e0f1dc186179de127618 Received: from unknown [63.251.237.3] by p02c12o145.mxlogic.net(mxl_mta-6.4.0-0) with SMTP id 5a4b3da4.0.73309.00-009.157754.p02c12o145.mxlogic.net (envelope-from ); Mon, 12 Oct 2009 16:59:26 -0600 (MDT) X-MXL-Hash: 4ad3b4ce78d86aa6-84aed59c9b2afd42bc571925dd84b25e97828db1 Received: from [10.2.1.145] ([10.2.1.145]) by mtiexch01.mti.com with Microsoft SMTPSVC(6.0.3790.3959); Mon, 12 Oct 2009 16:00:03 -0700 Message-ID: <4AD3B453.3030109@mellanox.com> Date: Mon, 12 Oct 2009 15:57:23 -0700 From: Vu Pham User-Agent: Thunderbird 2.0.0.23 (Windows/20090812) MIME-Version: 1.0 To: Linux RDMA list Subject: [ofa-general][PATCH 3/4] SRP fail-over faster X-OriginalArrivalTime: 12 Oct 2009 23:00:03.0781 (UTC) FILETIME=[BBDE8750:01CA4B8F] X-Spam: [F=0.2000000000; CM=0.500; S=0.200(2009092101)] X-MAIL-FROM: X-SOURCE-IP: [63.251.237.3] X-AnalysisOut: [v=1.0 c=1 a=58m7CfzORCEA:10 
a=xupnbh4h0YLOHZnncC45HQ==:17 ] X-AnalysisOut: [a=CbDCq_QkAAAA:8 a=PwJ2IG6fAAAA:8 a=AUd_NHdVAAAA:8 a=mSIgo] X-AnalysisOut: [ubUyuc5rLjmprkA:9 a=O7OkZntnFGRI0lrWWSkA:7 a=_rFSEwfNx-pd9] X-AnalysisOut: [71CmWn7bf8l0cQA:4 a=E3yz0KKPV6YA:10 a=JfD0Fch1gWkA:10 a=_w] X-AnalysisOut: [LNh3oqJEw-ivR9:21 a=esl7JsWiReFRUl1s:21] Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org Index: ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.c =================================================================== --- ofed_kernel.orig/drivers/infiniband/ulp/srp/ib_srp.c +++ ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.c @@ -78,6 +77,13 @@ MODULE_PARM_DESC(mellanox_workarounds, "Enable workarounds for Mellanox SRP target bugs if != 0"); +static int srp_dev_loss_tmo = 60; + +module_param(srp_dev_loss_tmo, int, 0444); +MODULE_PARM_DESC(srp_dev_loss_tmo, + "Default number of seconds that srp transport should \ + insulate the loss of a remote port (default is 60 secs)"); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_completion(struct ib_cq *cq, void *target_ptr); @@ -898,6 +926,48 @@ DMA_FROM_DEVICE); } +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, work); + + srp_reconnect_target(target); + target->work_in_progress = 0; +} + +static void srp_qp_in_err_timer(unsigned long data) +{ + struct srp_target_port *target = (struct srp_target_port *)data; + struct srp_request *req, *tmp; + + if (target->state != SRP_TARGET_LIVE) + return; + + spin_lock_irq(target->scsi_host->host_lock); + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req); + spin_unlock_irq(target->scsi_host->host_lock); + + spin_lock_irq(target->scsi_host->host_lock); + if (!target->work_in_progress) { + target->work_in_progress = 1; + INIT_WORK(&target->work, 
srp_reconnect_work); + schedule_work(&target->work); + } + spin_unlock_irq(target->scsi_host->host_lock); +} + +static void srp_qp_err_add_timer(struct srp_target_port *target, int time) +{ + if (!timer_pending(&target->qp_err_timer)) { + setup_timer(&target->qp_err_timer, + srp_qp_in_err_timer, + (unsigned long)target); + target->qp_err_timer.expires = time * HZ + jiffies; + add_timer(&target->qp_err_timer); + } +} + static void srp_completion(struct ib_cq *cq, void *target_ptr) { struct srp_target_port *target = target_ptr; @@ -960,11 +980,20 @@ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { if (wc.status) { + unsigned long flags; + shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", wc.wr_id & SRP_OP_RECV ? "receive" : "send", wc.status); - target->qp_in_error = 1; + spin_lock_irqsave(target->scsi_host->host_lock, flags); + if (!target->qp_in_error && + target->state == SRP_TARGET_LIVE) { + target->qp_in_error = 1; + srp_qp_err_add_timer(target, + srp_dev_loss_tmo - 55); + } + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); break; } @@ -1274,5 +1299,6 @@ int attr_mask = 0; int comp = 0; int opcode = 0; + unsigned long flags; switch (event->event) { @@ -1301,6 +1381,14 @@ shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); + spin_lock_irqsave(target->scsi_host->host_lock, flags); + if (!target->qp_in_error && + target->state == SRP_TARGET_LIVE) { + target->qp_in_error = 1; + srp_qp_err_add_timer(target, + srp_dev_loss_tmo - 55); + } + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); target->status = 0; break; @@ -1443,9 +1529,22 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req, *tmp; int ret = FAILED; - shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "SRP reset_host called state %d qp_err %d\n", + 
target->state, target->qp_in_error); + + spin_lock_irq(target->scsi_host->host_lock); + if (timer_pending(&target->qp_err_timer) || target->qp_in_error || + target->state != SRP_TARGET_LIVE) { + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req); + spin_unlock_irq(target->scsi_host->host_lock); + return SUCCESS; + } + spin_unlock_irq(target->scsi_host->host_lock); if (!srp_reconnect_target(target)) ret = SUCCESS; @@ -2150,6 +2342,9 @@ sizeof (struct srp_indirect_buf) + srp_sg_tablesize * 16); + if (srp_dev_loss_tmo < 60) + srp_dev_loss_tmo = 60; + ret = class_register(&srp_class); if (ret) { printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); Index: ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.h =================================================================== --- ofed_kernel.orig/drivers/infiniband/ulp/srp/ib_srp.h +++ ofed_kernel/drivers/infiniband/ulp/srp/ib_srp.h @@ -153,12 +159,14 @@ struct srp_request req_ring[SRP_SQ_SIZE]; struct work_struct work; + int work_in_progress; struct list_head list; struct completion done; int status; enum srp_target_state state; int qp_in_error; + struct timer_list qp_err_timer; }; struct srp_iu {