From patchwork Tue Nov 26 12:24:20 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiubo Li X-Patchwork-Id: 11262087 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id C1B111390 for ; Tue, 26 Nov 2019 12:24:46 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id A19442068E for ; Tue, 26 Nov 2019 12:24:46 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="MvAuYerd" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728365AbfKZMYq (ORCPT ); Tue, 26 Nov 2019 07:24:46 -0500 Received: from us-smtp-2.mimecast.com ([205.139.110.61]:41191 "EHLO us-smtp-delivery-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1728361AbfKZMYp (ORCPT ); Tue, 26 Nov 2019 07:24:45 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1574771084; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=0434QUXjPvIaHT1EaIkqlZLBAS2xooOlYe242h7NpmM=; b=MvAuYerdJMd1mpfrECow+yFAp+ruwgaZYPYEYnrRN289QSmmW5unSXZFn7v+72R5FcNkpu LWB4hRCh3Toqw+BYwVDldHbegvfO3Ozfdr1PCMVGV/OBq3YFk2HBe98R3YSYIxH1uKmvKG ssAmTlVAIv/VxpjtKZu/Pj2VMZ55Nrs= Received: from mimecast-mx01.redhat.com (mimecast-mx01.redhat.com [209.132.183.4]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-384-gmd6WlKnOY29-wKYDhxCew-1; Tue, 26 Nov 2019 07:24:42 -0500 Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.phx2.redhat.com [10.5.11.15]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mimecast-mx01.redhat.com (Postfix) with ESMTPS id DE65780183C; Tue, 26 Nov 2019 12:24:41 +0000 (UTC) Received: from localhost.localdomain (ovpn-12-66.pek2.redhat.com [10.72.12.66]) by smtp.corp.redhat.com (Postfix) with ESMTP id 27D765D6BE; Tue, 26 Nov 2019 12:24:38 +0000 (UTC) From: xiubli@redhat.com To: jlayton@kernel.org, zyan@redhat.com Cc: sage@redhat.com, idryomov@gmail.com, pdonnell@redhat.com, ceph-devel@vger.kernel.org, Xiubo Li Subject: [PATCH v3 1/3] mdsmap: add more debug info when decoding Date: Tue, 26 Nov 2019 07:24:20 -0500 Message-Id: <20191126122422.12396-2-xiubli@redhat.com> In-Reply-To: <20191126122422.12396-1-xiubli@redhat.com> References: <20191126122422.12396-1-xiubli@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.79 on 10.5.11.15 X-MC-Unique: gmd6WlKnOY29-wKYDhxCew-1 X-Mimecast-Spam-Score: 0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org From: Xiubo Li Show the laggy state. Signed-off-by: Xiubo Li --- fs/ceph/mdsmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index aeec1d6e3769..471bac335fae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; + bool laggy; ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); @@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (err) goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr), - ceph_mds_state_name(state)); + ceph_mds_state_name(state), + laggy ? "(laggy)" : ""); if (mds < 0 || state <= 0) continue; @@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, @@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_damaged = false; } bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; From patchwork Tue Nov 26 12:24:21 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiubo Li X-Patchwork-Id: 11262089 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 9AA621393 for ; Tue, 26 Nov 2019 12:24:50 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 6D6312068E for ; Tue, 26 Nov 2019 12:24:50 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="SvtrLYi7" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728367AbfKZMYt (ORCPT ); Tue, 26 Nov 2019 07:24:49 -0500 Received: from us-smtp-delivery-1.mimecast.com ([205.139.110.120]:38753 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1728361AbfKZMYt (ORCPT ); Tue, 26 Nov 2019 07:24:49 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1574771087; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=OTYKAGWqYnQv15no50a3AVj4c5rB9k3jvQCPpR/jFiw=; b=SvtrLYi7wUjunPGUY1aDSD4ya5mlozVKY6QsEMZ32R3dZrAhm4eRLkirPlFYRBgLO0AY3z Gd3RK65QKaf54/1ntwKYKSwXw3AwZQfUeGJwWSWNNe7qsrHYTxWiYkM2Va4Kh4kxr2ykjj 3rM9Zx2h9feH//gPhfpFEZddXsIIGwY= Received: from mimecast-mx01.redhat.com (mimecast-mx01.redhat.com [209.132.183.4]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-374-GJd8MI3MO9iuT-lbcbOCZA-1; Tue, 26 Nov 2019 07:24:46 -0500 Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.phx2.redhat.com [10.5.11.15]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mimecast-mx01.redhat.com (Postfix) with ESMTPS id 2B44C1800D40; Tue, 26 Nov 2019 12:24:45 +0000 (UTC) Received: from localhost.localdomain (ovpn-12-66.pek2.redhat.com [10.72.12.66]) by smtp.corp.redhat.com (Postfix) with ESMTP id 762985D6BE; Tue, 26 Nov 2019 12:24:42 +0000 (UTC) From: xiubli@redhat.com To: jlayton@kernel.org, zyan@redhat.com Cc: sage@redhat.com, idryomov@gmail.com, pdonnell@redhat.com, ceph-devel@vger.kernel.org, Xiubo Li Subject: [PATCH v3 2/3] mdsmap: fix mdsmap cluster available check based on laggy number Date: Tue, 26 Nov 2019 07:24:21 -0500 Message-Id: <20191126122422.12396-3-xiubli@redhat.com> In-Reply-To: <20191126122422.12396-1-xiubli@redhat.com> References: <20191126122422.12396-1-xiubli@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.79 on 10.5.11.15 X-MC-Unique: GJd8MI3MO9iuT-lbcbOCZA-1 X-Mimecast-Spam-Score: 0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org From: Xiubo Li In case the max_mds > 1 in MDS cluster and there is no any standby MDS and all the max_mds MDSs are in up:active state, if one of the up:active MDSs is dead, the m->m_num_laggy in kclient will be 1. Then the mount will fail without considering other healthy MDSs. There manybe some MDSs still "in" the cluster but not in up:active state, we will ignore them. Only when all the up:active MDSs in the cluster are laggy will treat the cluster as not be available. In case decreasing the max_mds, the cluster will not stop the extra up:active MDSs immediately and there will be a latency. During it the up:active MDS number will be larger than the max_mds, so later the m_info memories will 100% be reallocated. Here will pick out the up:active MDSs as the m_num_mds and allocate the needed memories once. Signed-off-by: Xiubo Li --- fs/ceph/mdsmap.c | 38 +++++++++++++++++-------------------- include/linux/ceph/mdsmap.h | 5 +++-- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 471bac335fae..3418cf2c6a12 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -138,14 +138,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); - m->m_num_mds = m->m_max_mds; + + /* + * pick out the active nodes as the m_num_mds, the m_num_mds + * maybe larger than m_max_mds when decreasing the max_mds in + * cluster side, in other case it should less than or equal + * to m_max_mds. + */ + m->m_num_mds = n = ceph_decode_32(p); + m->m_num_active_mds = m->m_num_mds; m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (!m->m_info) goto nomem; /* pick out active nodes from mds_info (state > 0) */ - n = ceph_decode_32(p); for (i = 0; i < n; i++) { u64 global_id; u32 namelen; @@ -215,18 +222,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_mds_state_name(state), laggy ? "(laggy)" : ""); - if (mds < 0 || state <= 0) + if (mds < 0 || mds >= m->m_num_mds) { + pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); continue; + } - if (mds >= m->m_num_mds) { - int new_num = max(mds + 1, m->m_num_mds * 2); - void *new_m_info = krealloc(m->m_info, - new_num * sizeof(*m->m_info), - GFP_NOFS | __GFP_ZERO); - if (!new_m_info) - goto nomem; - m->m_info = new_m_info; - m->m_num_mds = new_num; + if (state <= 0) { + pr_warn("mdsmap_decode got incorrect state(%s)\n", + ceph_mds_state_name(state)); + continue; } info = &m->m_info[mds]; @@ -247,14 +251,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } - if (m->m_num_mds > m->m_max_mds) { - /* find max up mds */ - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { - if (i == 0 || m->m_info[i-1].state > 0) - break; - } - m->m_num_mds = i; - } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -396,7 +392,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_damaged) return false; - if (m->m_num_laggy > 0) + if (m->m_num_laggy == m->m_num_active_mds) return false; for (i = 0; i < m->m_num_mds; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 0067d767c9ae..3a66f4f926ce 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,8 +25,9 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; - u32 m_max_mds; /* size of m_addr, m_state arrays */ - int m_num_mds; + u32 m_max_mds; /* expected up:active mds number */ + int m_num_active_mds; /* actual up:active mds number */ + int m_num_mds; /* size of m_info array */ struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ From patchwork Tue Nov 26 12:24:22 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiubo Li X-Patchwork-Id: 11262091 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id ED4461393 for ; Tue, 26 Nov 2019 12:24:53 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id CE1B62068E for ; Tue, 26 Nov 2019 12:24:53 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="F2BkRN5T" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1728373AbfKZMYx (ORCPT ); Tue, 26 Nov 2019 07:24:53 -0500 Received: from us-smtp-delivery-1.mimecast.com ([207.211.31.120]:38559 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1728361AbfKZMYw (ORCPT ); Tue, 26 Nov 2019 07:24:52 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1574771091; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=95xHkjFfntMGoj7eo2xSGgnX9aGcbuQcnLi3CaS/buI=; b=F2BkRN5TsIgyLV9PRr30xLjZlFsRGV7Mqu3nM46Vu1mgvxoTkRnUTDWG7HOzoeIqYL1r/G hseNKFQYcbD9jHlx+vtUofJy6RILC6guhN2/hAxJqiGPObdVdxThpk6Fkkf2bk5UdkJQpC K3JpTFTtd8+zd+CIGMAd10bHQes9gV4= Received: from mimecast-mx01.redhat.com (mimecast-mx01.redhat.com [209.132.183.4]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-359-i-iLk7HAOfGXzveMwgqrbg-1; Tue, 26 Nov 2019 07:24:49 -0500 Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.phx2.redhat.com [10.5.11.15]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mimecast-mx01.redhat.com (Postfix) with ESMTPS id 6F40E477; Tue, 26 Nov 2019 12:24:48 +0000 (UTC) Received: from localhost.localdomain (ovpn-12-66.pek2.redhat.com [10.72.12.66]) by smtp.corp.redhat.com (Postfix) with ESMTP id B60325D6C3; Tue, 26 Nov 2019 12:24:45 +0000 (UTC) From: xiubli@redhat.com To: jlayton@kernel.org, zyan@redhat.com Cc: sage@redhat.com, idryomov@gmail.com, pdonnell@redhat.com, ceph-devel@vger.kernel.org, Xiubo Li Subject: [PATCH v3 3/3] mdsmap: only choose one MDS who is in up:active state without laggy Date: Tue, 26 Nov 2019 07:24:22 -0500 Message-Id: <20191126122422.12396-4-xiubli@redhat.com> In-Reply-To: <20191126122422.12396-1-xiubli@redhat.com> References: <20191126122422.12396-1-xiubli@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.79 on 10.5.11.15 X-MC-Unique: i-iLk7HAOfGXzveMwgqrbg-1 X-Mimecast-Spam-Score: 0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org From: Xiubo Li Even the MDS is in up:active state, but it also maybe laggy. Here will skip the laggy MDSs. Signed-off-by: Xiubo Li --- fs/ceph/mds_client.c | 13 +++++++++---- fs/ceph/mdsmap.c | 30 +++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0444288fe87e..2c92a1452876 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -972,14 +972,14 @@ static int __choose_mds(struct ceph_mds_client *mdsc, frag.frag, mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) + CEPH_MDS_STATE_ACTIVE && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } /* since this file/dir wasn't known to be * replicated, then we want to look for the * authoritative mds. */ - mode = USE_AUTH_MDS; if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; @@ -987,9 +987,14 @@ static int __choose_mds(struct ceph_mds_client *mdsc, "frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) - goto out; + CEPH_MDS_STATE_ACTIVE) { + if (mode == USE_ANY_MDS && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, + mds)) + goto out; + } } + mode = USE_AUTH_MDS; } } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 3418cf2c6a12..284d68646c40 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -13,22 +13,24 @@ #include "super.h" +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ + (m->m_info[i].state > 0 && (ignore_laggy ? true : !m->m_info[i].laggy)) -/* - * choose a random mds that is "up" (i.e. has a state > 0), or -1. - */ -int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) { int n = 0; int i, j; - /* special case for one mds */ + /* + * special case for one mds, no matter it is laggy or + * not we have no choice + */ if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ for (i = 0; i < m->m_num_mds; i++) - if (m->m_info[i].state > 0) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) n++; if (n == 0) return -1; @@ -36,7 +38,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) /* pick */ n = prandom_u32() % n; for (j = 0, i = 0; i < m->m_num_mds; i++) { - if (m->m_info[i].state > 0) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; if (j > n) break; @@ -45,6 +47,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return i; } +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int mds; + + mds = __mdsmap_get_random_mds(m, false); + if (mds == m->m_num_mds || mds == -1) + mds = __mdsmap_get_random_mds(m, true); + + return mds == m->m_num_mds ? -1 : mds; +} + #define __decode_and_drop_type(p, end, type, bad) \ do { \ if (*p + sizeof(type) > end) \