From patchwork Tue Dec 12 16:24:31 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489588 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="ROZISrpa" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 80E37E4; Tue, 12 Dec 2023 08:30:20 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 173401E11A; Tue, 12 Dec 2023 18:30:19 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 00A0A1DE4B; Tue, 12 Dec 2023 18:30:19 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 086383C07D0; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=LCqJ4mlbdB9LVyu6Ty+lZ84MDsYbF5Y4CPShya8Z0/c=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=ROZISrpae/UfddFIYXmidcimd6A0vz0/SIgROhThJNWUKFe5PBse2sNF6CuK6Ca4k LvtKvtQN4FmwA+wOXwBTJNd5UskX9iA3urpex3TCPlA4+NgrS2u3QDM+GhwjRkJsra urCRj8y3KPaunG/M+5FlT3Hr99tMG6lKRpMchm98= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQT8f094048; Tue, 12 Dec 2023 18:26:29 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQT3U094047; Tue, 12 Dec 2023 18:26:29 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 01/14] rculist_bl: add hlist_bl_for_each_entry_continue_rcu Date: Tue, 12 Dec 2023 18:24:31 +0200 Message-ID: <20231212162444.93801-2-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Change the old hlist_bl_first_rcu to hlist_bl_first_rcu_dereference to indicate that it is a RCU dereference. Add hlist_bl_next_rcu and hlist_bl_first_rcu to use RCU pointers and use them to fix sparse warnings. Add hlist_bl_for_each_entry_continue_rcu. 
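For reference, a minimal usage sketch of the new iterator (the struct and function below are made up for illustration and are not part of this patch): the continue variant resumes a walk after a node that is already known to be on the list, typically one found earlier in the same RCU read-side critical section or kept alive by a reference the caller holds.

	#include <linux/rculist_bl.h>

	struct foo {
		struct hlist_bl_node node;	/* linked into a bit-locked hash chain */
		int key;
	};

	/* Count the entries that follow @start in its chain. @start must still
	 * be on the list: found earlier in the same RCU read-side critical
	 * section, or kept alive and linked by a reference the caller holds.
	 */
	static int foo_count_after(struct foo *start)
	{
		struct hlist_bl_node *pos;
		struct foo *f = start;
		int n = 0;

		rcu_read_lock();
		hlist_bl_for_each_entry_continue_rcu(f, pos, node)
			n++;
		rcu_read_unlock();

		return n;
	}

As with the existing hlist_bl iterators, this only covers readers; writers still serialize on the chain head's bit lock.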
Signed-off-by: Julian Anastasov --- include/linux/rculist_bl.h | 49 +++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..36363b876e53 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,21 +8,31 @@ #include #include +/* return the first ptr or next element in an RCU protected list */ +#define hlist_bl_first_rcu(head) \ + (*((struct hlist_bl_node __rcu **)(&(head)->first))) +#define hlist_bl_next_rcu(node) \ + (*((struct hlist_bl_node __rcu **)(&(node)->next))) + static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); - rcu_assign_pointer(h->first, + rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) -{ - return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); -} +#define hlist_bl_first_rcu_dereference(head) \ +({ \ + struct hlist_bl_head *__head = (head); \ + \ + (struct hlist_bl_node *) \ + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ + hlist_bl_is_locked(__head)) & \ + ~LIST_BL_LOCKMASK); \ +}) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization @@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, { struct hlist_bl_node *first; - /* don't need hlist_bl_first_rcu because we're under lock */ + /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; @@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = hlist_bl_first_rcu(head); \ + for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) + +/** + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given + * type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @member: the name of the hlist_bl_node within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position which must have been in the list when the RCU read + * lock was taken. + * This would typically require either that you obtained the node from a + * previous walk of the list in the same RCU read-side critical section, or + * that you held some sort of non-RCU reference (such as a reference count) + * to keep the node alive *and* in the list. 
+ */ +#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif From patchwork Tue Dec 12 16:24:32 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489597 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="sZIy9T+J" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 97CB6F2; Tue, 12 Dec 2023 08:30:56 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 3D3971E2A2; Tue, 12 Dec 2023 18:30:55 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 25AE41E166; Tue, 12 Dec 2023 18:30:55 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 4C8673C07D9; Tue, 12 Dec 2023 18:30:13 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398613; bh=283+qi0/xH2KVO9dj+/Q/5nqY/xgffAAzTfNM6vftiw=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=sZIy9T+J8NX5fcXDKsI/VuaZPXwCLMpffSGNv0DZPidKaD4O0fSaVhRqoNJipEQgq iYKab2q3rrDdAlNFkyVplx3dzzSax72QY2YI2KROFAlaeyecpQTT4u78YqxE/tq7SE hI48tp1DEL4Cj9AQuPU/xMCP98E05EpA0S6h3D68= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQTaT094054; Tue, 12 Dec 2023 18:26:29 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQTV0094052; Tue, 12 Dec 2023 18:26:29 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 02/14] ipvs: make ip_vs_svc_table and ip_vs_svc_fwm_table per netns Date: Tue, 12 Dec 2023 18:24:32 +0200 Message-ID: <20231212162444.93801-3-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC From: Jiejian Wu Current ipvs uses one global mutex "__ip_vs_mutex" to keep the global "ip_vs_svc_table" and "ip_vs_svc_fwm_table" safe. But when there are tens of thousands of services from different netns in the table, it takes a long time to look up the table, for example, using "ipvsadm -ln" from different netns simultaneously. We make "ip_vs_svc_table" and "ip_vs_svc_fwm_table" per netns, and we add "service_mutex" per netns to keep these two tables safe instead of the global "__ip_vs_mutex" in current version. To this end, looking up services from different netns simultaneously will not get stuck, shortening the time consumption in large-scale deployment. It can be reproduced using the simple scripts below. 
init.sh: #!/bin/bash for((i=1;i<=4;i++));do ip netns add ns$i ip netns exec ns$i ip link set dev lo up ip netns exec ns$i sh add-services.sh done add-services.sh: #!/bin/bash for((i=0;i<30000;i++)); do ipvsadm -A -t 10.10.10.10:$((80+$i)) -s rr done runtest.sh: #!/bin/bash for((i=1;i<4;i++));do ip netns exec ns$i ipvsadm -ln > /dev/null & done ip netns exec ns4 ipvsadm -ln > /dev/null Run "sh init.sh" to initiate the network environment. Then run "time ./runtest.sh" to evaluate the time consumption. Our testbed is a 4-core Intel Xeon ECS. The result of the original version is around 8 seconds, while the result of the modified version is only 0.8 seconds. Signed-off-by: Jiejian Wu Co-developed-by: Dust Li Signed-off-by: Dust Li Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 13 +++ net/netfilter/ipvs/ip_vs_ctl.c | 167 ++++++++++++++------------------- net/netfilter/ipvs/ip_vs_est.c | 18 ++-- 3 files changed, 94 insertions(+), 104 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff406ef4fd4a..68e562bc9df2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -33,6 +33,12 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -1041,6 +1047,13 @@ struct netns_ipvs { */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ + + /* the service mutex that protect svc_table and svc_fwm_table */ + struct mutex service_mutex; + /* the service table hashed by */ + struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; + /* the service table hashed by fwmark */ + struct hlist_head svc_fwm_table[IP_VS_SVC_TAB_SIZE]; }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 143a341bbc0a..7189bf6bd371 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -49,7 +49,7 @@ MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); -DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ +static struct lock_class_key __ipvs_service_key; /* sysctl variables */ @@ -294,17 +294,6 @@ ip_vs_use_count_dec(void) } -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by */ -static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* @@ -339,8 +328,8 @@ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 } /* - * Hashes a service in the ip_vs_svc_table by - * or in the ip_vs_svc_fwm_table by fwmark. + * Hashes a service in the svc_table by + * or in the svc_fwm_table by fwmark. * Should be called with locked tables. 
*/ static int ip_vs_svc_hash(struct ip_vs_service *svc) @@ -355,17 +344,17 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* - * Hash it by in ip_vs_svc_table + * Hash it by in svc_table */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); - hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); + hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); - hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hlist_add_head_rcu(&svc->f_list, &svc->ipvs->svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -414,12 +403,9 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); - hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { - if ((svc->af == af) - && ip_vs_addr_equal(af, &svc->addr, vaddr) - && (svc->port == vport) - && (svc->protocol == protocol) - && (svc->ipvs == ipvs)) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { + if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && + svc->port == vport && svc->protocol == protocol) { /* HIT */ return svc; } @@ -441,9 +427,8 @@ __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af - && (svc->ipvs == ipvs)) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ return svc; } @@ -1701,10 +1686,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) * Flush the service table hashed by */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], s_list) { - if (svc->ipvs == ipvs) - ip_vs_unlink_service(svc, cleanup); + ip_vs_unlink_service(svc, cleanup); } } @@ -1712,10 +1696,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + hlist_for_each_entry_safe(svc, n, &ipvs->svc_fwm_table[idx], f_list) { - if (svc->ipvs == ipvs) - ip_vs_unlink_service(svc, cleanup); + ip_vs_unlink_service(svc, cleanup); } } @@ -1732,12 +1715,12 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list) struct net *net; /* Check for "full" addressed entries */ - mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); + mutex_lock(&ipvs->service_mutex); ip_vs_flush(ipvs, true); + mutex_unlock(&ipvs->service_mutex); } - mutex_unlock(&__ip_vs_mutex); } /* Put all references for device (dst_cache) */ @@ -1775,25 +1758,20 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (svc->ipvs == ipvs) { - list_for_each_entry(dest, &svc->destinations, - n_list) { - ip_vs_forget_dev(dest, dev); - } + 
hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); } } - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (svc->ipvs == ipvs) { - list_for_each_entry(dest, &svc->destinations, - n_list) { - ip_vs_forget_dev(dest, dev); - } + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); } - } } @@ -1802,7 +1780,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, ip_vs_forget_dev(dest, dev); } spin_unlock_bh(&ipvs->dest_trash_lock); - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return NOTIFY_DONE; } @@ -1826,16 +1804,14 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (svc->ipvs == ipvs) - ip_vs_zero_service(svc); + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (svc->ipvs == ipvs) - ip_vs_zero_service(svc); + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); } } @@ -2306,9 +2282,9 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { - if ((svc->ipvs == ipvs) && pos-- == 0) { - iter->table = ip_vs_svc_table; + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { + if (pos-- == 0) { + iter->table = ipvs->svc_table; iter->bucket = idx; return svc; } @@ -2317,10 +2293,10 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], f_list) { - if ((svc->ipvs == ipvs) && pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; + if (pos-- == 0) { + iter->table = ipvs->svc_fwm_table; iter->bucket = idx; return svc; } @@ -2343,6 +2319,8 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); ++*pos; if (v == SEQ_START_TOKEN) @@ -2351,7 +2329,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; - if (iter->table == ip_vs_svc_table) { + if (iter->table == ipvs->svc_table) { /* next service in table hashed by protocol */ e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) @@ -2359,13 +2337,13 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, - &ip_vs_svc_table[iter->bucket], + &ipvs->svc_table[iter->bucket], s_list) { return svc; } } - iter->table = ip_vs_svc_fwm_table; + iter->table = ipvs->svc_fwm_table; iter->bucket = -1; goto scan_fwmark; } @@ -2378,7 +2356,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, - &ip_vs_svc_fwm_table[iter->bucket], + 
&ipvs->svc_fwm_table[iter->bucket], f_list) return svc; } @@ -2414,7 +2392,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) if (svc->ipvs != ipvs) return 0; - if (iter->table == ip_vs_svc_table) { + if (iter->table == ipvs->svc_table) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", @@ -2736,7 +2714,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) return ret; } - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(ipvs, false); @@ -2833,7 +2811,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) } out_unlock: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -2871,9 +2849,9 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || (svc->ipvs != ipvs)) + if (svc->af != AF_INET) continue; if (count >= get->num_services) @@ -2890,9 +2868,9 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || (svc->ipvs != ipvs)) + if (svc->af != AF_INET) continue; if (count >= get->num_services) @@ -3061,7 +3039,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { @@ -3160,7 +3138,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } out: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -3395,10 +3373,10 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start || (svc->ipvs != ipvs)) + hlist_for_each_entry(svc, &ipvs->svc_table[i], s_list) { + if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -3408,8 +3386,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start || (svc->ipvs != ipvs)) + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[i], f_list) { + if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -3419,7 +3397,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } nla_put_failure: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); cb->args[0] = idx; return skb->len; @@ -3608,7 +3586,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) 
@@ -3633,7 +3611,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, cb->args[0] = idx; out_err: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return skb->len; } @@ -3919,7 +3897,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) cmd = info->genlhdr->cmd; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); if (cmd == IPVS_CMD_FLUSH) { ret = ip_vs_flush(ipvs, false); @@ -4031,7 +4009,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } out: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -4061,7 +4039,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) @@ -4129,7 +4107,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) out_err: nlmsg_free(msg); out: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -4246,6 +4224,7 @@ static struct genl_family ip_vs_genl_family __ro_after_init = { .small_ops = ip_vs_genl_ops, .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), .resv_start_op = IPVS_CMD_FLUSH + 1, + .parallel_ops = 1, }; static int __init ip_vs_genl_register(void) @@ -4419,6 +4398,13 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) int ret = -ENOMEM; int idx; + /* Initialize service_mutex, svc_table, svc_fwm_table per netns */ + __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ipvs->svc_table[idx]); + INIT_HLIST_HEAD(&ipvs->svc_fwm_table[idx]); + } + /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->rs_table[idx]); @@ -4523,17 +4509,8 @@ void ip_vs_unregister_nl_ioctl(void) int __init ip_vs_control_init(void) { - int idx; int ret; - /* Initialize svc_table, ip_vs_svc_fwm_table */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); - INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - - smp_wmb(); /* Do we really need it now ? */ - ret = register_netdevice_notifier(&ip_vs_dst_notifier); if (ret < 0) return ret; diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index c5970ba416ae..323098dc8be6 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -601,7 +601,7 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) while (1) { int max = 16; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); while (max-- > 0) { est = hlist_entry_safe(ipvs->est_temp_list.first, @@ -621,12 +621,12 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) } goto unlock; } - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); cond_resched(); } unlock: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); } /* Calculate limits for all kthreads */ @@ -646,9 +646,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); kd = ipvs->est_kt_arr[0]; - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); s = kd ? 
kd->calc_stats : NULL; if (!s) goto out; @@ -747,7 +747,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period @@ -814,9 +814,9 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) /* Give chance estimators to be added (to est_temp_list) * and deleted (releasing kthread contexts) */ - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); cond_resched(); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); /* Current kt released ? */ if (id >= ipvs->est_kt_count) @@ -892,7 +892,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) mutex_unlock(&ipvs->est_mutex); unlock: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) From patchwork Tue Dec 12 16:24:33 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489592 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="CqLLstQS" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 40A30DB; Tue, 12 Dec 2023 08:30:37 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id C89371DE65; Tue, 12 Dec 2023 18:30:35 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id B12161E1EC; Tue, 12 Dec 2023 18:30:35 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 9F9AE3C07D4; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=6oS6oaiXaH8VvLRUr8oQsy8y5zEHQigzNWQOB4Hy/zM=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=CqLLstQSigIAzOSRX6FJAwhG3qopzVykWsVEqTkKYBA4fCgfEVbzgMkIz7kc/ds4r Wy1HaUpPF15MsEEUGaipQqpLb8dQvuAvjcbk2ykGzrkmhfmmfBq2ppcpBoDfp5LKLj TJNgDaXuJlailitNTXrS8mDe5rXBmePu5PRCmlwk= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQUo2094058; Tue, 12 Dec 2023 18:26:30 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQUKS094057; Tue, 12 Dec 2023 18:26:30 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 03/14] ipvs: some service readers can use RCU Date: Tue, 12 Dec 2023 18:24:33 +0200 Message-ID: <20231212162444.93801-4-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Some places walk the services under mutex but they can just use RCU: * ip_vs_dst_event() uses ip_vs_forget_dev() which uses its own lock to modify dest * ip_vs_genl_dump_services(): ip_vs_genl_fill_service() just fills skb * ip_vs_genl_parse_service(): move RCU lock to callers ip_vs_genl_set_cmd(), ip_vs_genl_dump_dests() and 
ip_vs_genl_get_cmd() * ip_vs_genl_dump_dests(): just fill skb Signed-off-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 47 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7189bf6bd371..268a71f6aa97 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1758,23 +1758,21 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); - mutex_lock(&ipvs->service_mutex); + rcu_read_lock(); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { - list_for_each_entry(dest, &svc->destinations, - n_list) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) + list_for_each_entry_rcu(dest, &svc->destinations, + n_list) ip_vs_forget_dev(dest, dev); - } - } - hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { - list_for_each_entry(dest, &svc->destinations, - n_list) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], f_list) + list_for_each_entry_rcu(dest, &svc->destinations, + n_list) ip_vs_forget_dev(dest, dev); - } - } } + rcu_read_unlock(); + mutex_lock(&ipvs->service_mutex); spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { ip_vs_forget_dev(dest, dev); @@ -3317,9 +3315,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, goto nla_put_failure; } - sched = rcu_dereference_protected(svc->scheduler, 1); + sched = rcu_dereference(svc->scheduler); sched_name = sched ? sched->name : "none"; - pe = rcu_dereference_protected(svc->pe, 1); + pe = rcu_dereference(svc->pe); if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || @@ -3373,9 +3371,9 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&ipvs->service_mutex); + rcu_read_lock(); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry(svc, &ipvs->svc_table[i], s_list) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3386,7 +3384,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry(svc, &ipvs->svc_fwm_table[i], f_list) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[i], f_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3397,7 +3395,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } nla_put_failure: - mutex_unlock(&ipvs->service_mutex); + rcu_read_unlock(); cb->args[0] = idx; return skb->len; @@ -3453,13 +3451,11 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, usvc->fwmark = 0; } - rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); - rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ @@ -3586,7 +3582,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); - 
mutex_lock(&ipvs->service_mutex); + rcu_read_lock(); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) @@ -3598,7 +3594,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, goto out_err; /* Dump the destinations */ - list_for_each_entry(dest, &svc->destinations, n_list) { + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { @@ -3611,7 +3607,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, cb->args[0] = idx; out_err: - mutex_unlock(&ipvs->service_mutex); + rcu_read_unlock(); return skb->len; } @@ -3917,9 +3913,12 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = true; + /* We use function that requires RCU lock */ + rcu_read_lock(); ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); + rcu_read_unlock(); if (ret) goto out; @@ -4039,7 +4038,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - mutex_lock(&ipvs->service_mutex); + rcu_read_lock(); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) @@ -4107,7 +4106,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) out_err: nlmsg_free(msg); out: - mutex_unlock(&ipvs->service_mutex); + rcu_read_unlock(); return ret; } From patchwork Tue Dec 12 16:24:34 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489595 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="UGjTuuCH" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 46D5EA8; Tue, 12 Dec 2023 08:30:49 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id DF84C1E287; Tue, 12 Dec 2023 18:30:47 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id C3B781E20D; Tue, 12 Dec 2023 18:30:47 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 094D33C07D7; Tue, 12 Dec 2023 18:30:13 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398613; bh=wnlryUky3WgHkhmsJ3+YMfeChV5duHSxMHOzDWRwkVs=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=UGjTuuCHwySZYQl3NeS/Fj40hR2QeKzj3GsTO9HUNwKKElvYWMqHEwKJXHRWhVaJ+ Pg3hC/UL0fv2tRADkL6OigqL+raG7VemQIGirBYeVpO5PQpV53JgOmgiC2SD4/FJsc XQS12qBybTUxun1dKebI6hZiyMeYZ+qZPbZx8czk= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQUHI094062; Tue, 12 Dec 2023 18:26:30 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQUMV094061; Tue, 12 Dec 2023 18:26:30 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 04/14] ipvs: use single svc table Date: Tue, 12 Dec 2023 18:24:34 +0200 Message-ID: <20231212162444.93801-5-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: 
<20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC fwmark based services and non-fwmark based services can be hashed in same service table. This reduces the burden of working with two tables. Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 8 +- net/netfilter/ipvs/ip_vs_ctl.c | 146 +++++---------------------------- 2 files changed, 22 insertions(+), 132 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 68e562bc9df2..b0a9f67a5c33 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -679,8 +679,7 @@ struct ip_vs_dest_user_kern { * forwarding entries. */ struct ip_vs_service { - struct hlist_node s_list; /* for normal service table */ - struct hlist_node f_list; /* for fwmark-based service table */ + struct hlist_node s_list; /* node in service table */ atomic_t refcnt; /* reference counter */ u16 af; /* address family */ @@ -1050,10 +1049,7 @@ struct netns_ipvs { /* the service mutex that protect svc_table and svc_fwm_table */ struct mutex service_mutex; - /* the service table hashed by */ - struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; - /* the service table hashed by fwmark */ - struct hlist_head svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 268a71f6aa97..e325e5f9d37b 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -329,7 +329,7 @@ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 /* * Hashes a service in the svc_table by - * or in the svc_fwm_table by fwmark. + * or by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) @@ -344,18 +344,17 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* - * Hash it by in svc_table + * Hash it by */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); } else { /* - * Hash it by fwmark in svc_fwm_table + * Hash it by fwmark */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); - hlist_add_head_rcu(&svc->f_list, &svc->ipvs->svc_fwm_table[hash]); } + hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); svc->flags |= IP_VS_SVC_F_HASHED; /* increase its refcnt because it is referenced by the svc table */ @@ -365,7 +364,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* - * Unhashes a service from svc_table / svc_fwm_table. + * Unhashes a service from svc_table. * Should be called with locked tables. 
*/ static int ip_vs_svc_unhash(struct ip_vs_service *svc) @@ -376,13 +375,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) return 0; } - if (svc->fwmark == 0) { - /* Remove it from the svc_table table */ - hlist_del_rcu(&svc->s_list); - } else { - /* Remove it from the svc_fwm_table table */ - hlist_del_rcu(&svc->f_list); - } + /* Remove it from svc_table */ + hlist_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); @@ -405,7 +399,8 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && - svc->port == vport && svc->protocol == protocol) { + svc->port == vport && svc->protocol == protocol && + !svc->fwmark) { /* HIT */ return svc; } @@ -427,7 +422,7 @@ __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[hash], f_list) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ return svc; @@ -1682,26 +1677,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) struct ip_vs_service *svc; struct hlist_node *n; - /* - * Flush the service table hashed by - */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], - s_list) { + s_list) ip_vs_unlink_service(svc, cleanup); - } } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ipvs->svc_fwm_table[idx], - f_list) { - ip_vs_unlink_service(svc, cleanup); - } - } - return 0; } @@ -1764,11 +1744,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, list_for_each_entry_rcu(dest, &svc->destinations, n_list) ip_vs_forget_dev(dest, dev); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], f_list) - list_for_each_entry_rcu(dest, &svc->destinations, - n_list) - ip_vs_forget_dev(dest, dev); } rcu_read_unlock(); @@ -1802,15 +1777,8 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } } ip_vs_zero_stats(&ipvs->tot_stats->s); @@ -2246,7 +2214,6 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - struct hlist_head *table; int bucket; }; @@ -2269,7 +2236,6 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) } -/* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); @@ -2278,29 +2244,14 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_service *svc; - /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { if (pos-- == 0) { - iter->table = ipvs->svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < 
IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], - f_list) { - if (pos-- == 0) { - iter->table = ipvs->svc_fwm_table; iter->bucket = idx; return svc; } } } - return NULL; } @@ -2327,38 +2278,17 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; - if (iter->table == ipvs->svc_table) { - /* next service in table hashed by protocol */ - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); - if (e) - return hlist_entry(e, struct ip_vs_service, s_list); - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - hlist_for_each_entry_rcu(svc, - &ipvs->svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ipvs->svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) - return hlist_entry(e, struct ip_vs_service, f_list); + return hlist_entry(e, struct ip_vs_service, s_list); - scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, - &ipvs->svc_fwm_table[iter->bucket], - f_list) + &ipvs->svc_table[iter->bucket], + s_list) { return svc; + } } - return NULL; } @@ -2380,17 +2310,12 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; - if (svc->ipvs != ipvs) - return 0; - if (iter->table == ipvs->svc_table) { + if (!svc->fwmark) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", @@ -2865,24 +2790,6 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, } } - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } out: return ret; } @@ -3383,17 +3290,6 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } } - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[i], f_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - nla_put_failure: rcu_read_unlock(); cb->args[0] = idx; @@ -4397,12 +4293,10 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) int ret = -ENOMEM; int idx; - /* Initialize service_mutex, svc_table, svc_fwm_table per netns */ + /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->svc_table[idx]); - INIT_HLIST_HEAD(&ipvs->svc_fwm_table[idx]); - } /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) From patchwork Tue Dec 12 16:24:35 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 
Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489590 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="Q0f1qWr/" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 18115110; Tue, 12 Dec 2023 08:30:29 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id AB4E11E09C; Tue, 12 Dec 2023 18:30:27 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 92FCB1E1DD; Tue, 12 Dec 2023 18:30:27 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 64D1A3C07D2; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=KjEIq9gif1Gkj7PVtooN+W8tx5iL2XwqlO2KORUTVSc=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=Q0f1qWr/yZOzsKNvVBUFRAcwO5fSmkKR5GGUWaN5rdJU+req0xVzYJD4gfP2pKMvz FO5j6EgeKDaWO0Qhbp1878x1Wg08h/+h//MmWihzBHk7cv4ZPnD3z9p+lGFY1AG11f hEive0ZMKcYlr3j131i/YF3bkKODgZwrlCmmer5U= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQUHb094066; Tue, 12 Dec 2023 18:26:30 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQU83094065; Tue, 12 Dec 2023 18:26:30 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 05/14] ipvs: do not keep dest_dst after dest is removed Date: Tue, 12 Dec 2023 18:24:35 +0200 Message-ID: <20231212162444.93801-6-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Before now dest->dest_dst is not released when server is moved into dest_trash list after removal. As result, we can keep dst/dev references for long time without actively using them. It is better to avoid walking the dest_trash list when ip_vs_dst_event() receives dev events. So, make sure we do not hold dev references in dest_trash list. As packets can be flying while server is being removed, check the IP_VS_DEST_F_AVAILABLE flag in slow path to ensure we do not save new dev references to removed servers. 
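To make the locking easier to follow, here is a condensed sketch of the ordering the hunks below rely on (only the relevant statements are shown, taken from __ip_vs_unlink_dest() and the __ip_vs_get_out_rt*() slow path):

	/* removal path: mark the server unavailable, then drop its cache */
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
	spin_lock_bh(&dest->dst_lock);
	__ip_vs_dst_cache_reset(dest);		/* releases cached dst/dev refs */
	spin_unlock_bh(&dest->dst_lock);

	/* transmit slow path: attach a new cache only while still available */
	spin_lock_bh(&dest->dst_lock);
	if (dest->flags & IP_VS_DEST_F_AVAILABLE)
		__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
	else
		noref = 0;			/* caller frees dest_dst instead */
	spin_unlock_bh(&dest->dst_lock);

Because both sides take dest->dst_lock, a packet racing with the removal either attaches the cache before the reset (which then drops it) or sees the flag cleared and frees its dest_dst, so no dst/dev reference can outlive the server.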
Signed-off-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 20 +++++++---------- net/netfilter/ipvs/ip_vs_xmit.c | 39 ++++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e325e5f9d37b..7a8c7ac94194 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -810,7 +810,6 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); - __ip_vs_dst_cache_reset(dest); __ip_vs_svc_put(svc); call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } @@ -1013,10 +1012,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->af = udest->af; - spin_lock_bh(&dest->dst_lock); - __ip_vs_dst_cache_reset(dest); - spin_unlock_bh(&dest->dst_lock); - if (add) { list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; @@ -1024,6 +1019,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { + spin_lock_bh(&dest->dst_lock); + __ip_vs_dst_cache_reset(dest); + spin_unlock_bh(&dest->dst_lock); + sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->upd_dest) sched->upd_dest(svc, dest); @@ -1258,6 +1257,10 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, { dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + spin_lock_bh(&dest->dst_lock); + __ip_vs_dst_cache_reset(dest); + spin_unlock_bh(&dest->dst_lock); + /* * Remove it from the d-linked destination list. */ @@ -1747,13 +1750,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } rcu_read_unlock(); - mutex_lock(&ipvs->service_mutex); - spin_lock_bh(&ipvs->dest_trash_lock); - list_for_each_entry(dest, &ipvs->dest_trash, t_list) { - ip_vs_forget_dev(dest, dev); - } - spin_unlock_bh(&ipvs->dest_trash_lock); - mutex_unlock(&ipvs->service_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9193e109e6b3..d7499f1e3af2 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -317,9 +317,11 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); - if (likely(dest_dst)) + if (likely(dest_dst)) { rt = (struct rtable *) dest_dst->dst_cache; - else { + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + } else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); if (!dest_dst) { @@ -335,14 +337,24 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + /* It is forbidden to attach dest->dest_dst if + * server is deleted. We can see the flag going down, + * for very short period and it must be checked under + * dst_lock. 
+ */ + if (dest->flags & IP_VS_DEST_F_AVAILABLE) + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + else + noref = 0; spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, rcuref_read(&rt->dst.__rcuref)); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + if (!noref) + ip_vs_dest_dst_free(dest_dst); } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.ip; } else { __be32 saddr = htonl(INADDR_ANY); @@ -480,9 +492,11 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); - if (likely(dest_dst)) + if (likely(dest_dst)) { rt = (struct rt6_info *) dest_dst->dst_cache; - else { + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + } else { u32 cookie; dest_dst = ip_vs_dest_dst_alloc(); @@ -503,14 +517,19 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, } rt = (struct rt6_info *) dst; cookie = rt6_get_cookie(rt); - __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + if (dest->flags & IP_VS_DEST_F_AVAILABLE) + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + else + noref = 0; spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, rcuref_read(&rt->dst.__rcuref)); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + if (!noref) + ip_vs_dest_dst_free(dest_dst); } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, From patchwork Tue Dec 12 16:24:36 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489594 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="n+RbIFk1" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7A6D3A8; Tue, 12 Dec 2023 08:30:44 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 217941E1F6; Tue, 12 Dec 2023 18:30:43 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 08EA81E0B0; Tue, 12 Dec 2023 18:30:43 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id E0B353C07D6; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=1syT4ItHzDrjL5pFQPfW1yRjnoMEA1fSTLgtEGKeGKw=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=n+RbIFk1k0P5CgcHBD/uJZzc0QsfVtyeitgXBbgwoUP4ag9ejyBrBAJywmwYqETmR gGrSuFJ/GD69nePIltRz4MG8xkrN1y4BqtKcnaPRlx2wg0dr5QsUkJCys0QyJKcaut TAwIta8BRSIunEm9s4cbqWyJO64SPlUvsov+SRWc= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQUM1094070; Tue, 12 Dec 2023 18:26:30 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQUje094069; Tue, 12 Dec 2023 18:26:30 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 06/14] ipvs: use more counters to avoid service lookups Date: Tue, 12 Dec 2023 18:24:36 +0200 Message-ID: <20231212162444.93801-7-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 
In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC When new connection is created we can lookup for services multiple times to support fallback options. We already have some counters to skip specific lookups because it costs CPU cycles for hash calculation, etc. Add more counters for fwmark/non-fwmark services (fwm_services and nonfwm_services) and make all counters per address family. Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 24 ++++++--- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 86 +++++++++++++++++++-------------- 3 files changed, 69 insertions(+), 43 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b0a9f67a5c33..6b9b32257e10 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -271,6 +271,18 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +/* For arrays per family */ +enum { + IP_VS_AF_INET, + IP_VS_AF_INET6, + IP_VS_AF_MAX +}; + +static inline int ip_vs_af_index(int af) +{ + return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET; +} + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -940,17 +952,17 @@ struct netns_ipvs { /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ - int num_services; /* no of virtual services */ - int num_services6; /* IPv6 virtual services */ - /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ /* Service counters */ - atomic_t ftpsvc_counter; - atomic_t nullsvc_counter; - atomic_t conn_out_counter; + atomic_t num_services[IP_VS_AF_MAX]; /* Services */ + atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ + atomic_t nonfwm_services[IP_VS_AF_MAX];/* Services */ + atomic_t ftpsvc_counter[IP_VS_AF_MAX]; /* FTPPORT */ + atomic_t nullsvc_counter[IP_VS_AF_MAX];/* Zero port */ + atomic_t conn_out_counter[IP_VS_AF_MAX];/* out conn */ #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a2c16b501087..d9be2c189fb3 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1404,7 +1404,7 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat return handle_response(af, skb, pd, cp, &iph, hooknum); /* Check for real-server-started requests */ - if (atomic_read(&ipvs->conn_out_counter)) { + if (atomic_read(&ipvs->conn_out_counter[ip_vs_af_index(af)])) { /* Currently only for UDP: * connection oriented protocols typically use * ephemeral ports for outgoing connections, so diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7a8c7ac94194..94123a55e1bd 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -437,35 +437,42 @@ struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - struct ip_vs_service *svc; + struct ip_vs_service *svc = NULL; + int af_id = ip_vs_af_index(af); /* * Check the table hashed by fwmark first */ - if (fwmark) { + if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) { svc = 
__ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } + if (!atomic_read(&ipvs->nonfwm_services[af_id])) + goto out; + /* * Check the table hashed by * for "full" addressed entries */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); + if (svc) + goto out; - if (!svc && protocol == IPPROTO_TCP && - atomic_read(&ipvs->ftpsvc_counter) && + if (protocol == IPPROTO_TCP && + atomic_read(&ipvs->ftpsvc_counter[af_id]) && (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); + if (svc) + goto out; } - if (svc == NULL - && atomic_read(&ipvs->nullsvc_counter)) { + if (atomic_read(&ipvs->nullsvc_counter[af_id])) { /* * Check if the catch-all port (port zero) exists */ @@ -1352,6 +1359,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, { int ret = 0; struct ip_vs_scheduler *sched = NULL; + int af_id = ip_vs_af_index(u->af); struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; int ret_hooks = -1; @@ -1396,8 +1404,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif - if ((u->af == AF_INET && !ipvs->num_services) || - (u->af == AF_INET6 && !ipvs->num_services6)) { + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) goto out_err; @@ -1448,17 +1455,17 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_inc(&ipvs->ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ipvs->nullsvc_counter); + atomic_inc(&ipvs->ftpsvc_counter[af_id]); + else if (!svc->port && !svc->fwmark) + atomic_inc(&ipvs->nullsvc_counter[af_id]); if (svc->pe && svc->pe->conn_out) - atomic_inc(&ipvs->conn_out_counter); + atomic_inc(&ipvs->conn_out_counter[af_id]); - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ipvs->num_services++; - else if (svc->af == AF_INET6) - ipvs->num_services6++; + if (svc->fwmark) + atomic_inc(&ipvs->fwm_services[af_id]); + else + atomic_inc(&ipvs->nonfwm_services[af_id]); + atomic_inc(&ipvs->num_services[af_id]); /* Hash the service into the service table */ ip_vs_svc_hash(svc); @@ -1503,6 +1510,8 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; bool new_pe_conn_out, old_pe_conn_out; + struct netns_ipvs *ipvs = svc->ipvs; + int af_id = ip_vs_af_index(svc->af); /* * Lookup the scheduler, by 'u->sched_name' @@ -1571,9 +1580,9 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) new_pe_conn_out = (pe && pe->conn_out) ? true : false; old_pe_conn_out = (old_pe && old_pe->conn_out) ? 
true : false; if (new_pe_conn_out && !old_pe_conn_out) - atomic_inc(&svc->ipvs->conn_out_counter); + atomic_inc(&ipvs->conn_out_counter[af_id]); if (old_pe_conn_out && !new_pe_conn_out) - atomic_dec(&svc->ipvs->conn_out_counter); + atomic_dec(&ipvs->conn_out_counter[af_id]); } out: @@ -1593,16 +1602,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; + int af_id = ip_vs_af_index(svc->af); - if (svc->af == AF_INET) { - ipvs->num_services--; - if (!ipvs->num_services) - ip_vs_unregister_hooks(ipvs, svc->af); - } else if (svc->af == AF_INET6) { - ipvs->num_services6--; - if (!ipvs->num_services6) - ip_vs_unregister_hooks(ipvs, svc->af); - } + atomic_dec(&ipvs->num_services[af_id]); + if (!atomic_read(&ipvs->num_services[af_id])) + ip_vs_unregister_hooks(ipvs, svc->af); + if (svc->fwmark) + atomic_dec(&ipvs->fwm_services[af_id]); + else + atomic_dec(&ipvs->nonfwm_services[af_id]); ip_vs_stop_estimator(svc->ipvs, &svc->stats); @@ -1614,7 +1622,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); if (old_pe && old_pe->conn_out) - atomic_dec(&ipvs->conn_out_counter); + atomic_dec(&ipvs->conn_out_counter[af_id]); ip_vs_pe_put(old_pe); /* @@ -1629,9 +1637,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) * Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_dec(&ipvs->ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ipvs->nullsvc_counter); + atomic_dec(&ipvs->ftpsvc_counter[af_id]); + else if (!svc->port && !svc->fwmark) + atomic_dec(&ipvs->nullsvc_counter[af_id]); /* * Free the service if nobody refers to it @@ -2961,7 +2969,8 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; - info.num_services = ipvs->num_services; + info.num_services = + atomic_read(&ipvs->num_services[IP_VS_AF_INET]); if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } @@ -4301,9 +4310,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); - atomic_set(&ipvs->ftpsvc_counter, 0); - atomic_set(&ipvs->nullsvc_counter, 0); - atomic_set(&ipvs->conn_out_counter, 0); + for (idx = 0; idx < IP_VS_AF_MAX; idx++) { + atomic_set(&ipvs->num_services[idx], 0); + atomic_set(&ipvs->fwm_services[idx], 0); + atomic_set(&ipvs->nonfwm_services[idx], 0); + atomic_set(&ipvs->ftpsvc_counter[idx], 0); + atomic_set(&ipvs->nullsvc_counter[idx], 0); + atomic_set(&ipvs->conn_out_counter[idx], 0); + } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); From patchwork Tue Dec 12 16:24:37 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489596 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="XHYOT+dY" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CAD31DB; Tue, 12 Dec 2023 08:30:52 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 
6A4CC1E215; Tue, 12 Dec 2023 18:30:51 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 503321E214; Tue, 12 Dec 2023 18:30:51 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 28AF23C07D8; Tue, 12 Dec 2023 18:30:13 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398613; bh=nUfmqMbZix2T7/NbXcaz/VS7coiuRDL37o0bogHV1ZY=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=XHYOT+dYyIUjY14k0O3aYHJKhX8/8ypZESqMMUxmexCGSwvbQbGSDjRBNDPLb+6mm c5PZrhnjuf83yoXz7uI4DWGl3dC5hl/U0sfOxS5gDYhAD/5V3l/bT0vJlYMTHUhybu aAOrokP55V6u9fBWUiW3tJuNXpEFlHE9yMOFe7s8= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQUZG094076; Tue, 12 Dec 2023 18:26:30 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQUce094073; Tue, 12 Dec 2023 18:26:30 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 07/14] ipvs: add resizable hash tables Date: Tue, 12 Dec 2023 18:24:37 +0200 Message-ID: <20231212162444.93801-8-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Add infrastructure for resizable hash tables based on hlist_bl which we will use in followup patches. Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 197 ++++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_conn.c | 5 - net/netfilter/ipvs/ip_vs_core.c | 169 +++++++++++++++++++++++++++ 3 files changed, 366 insertions(+), 5 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 6b9b32257e10..51adba5f1bb9 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -11,6 +11,7 @@ #include /* for __uXX types */ #include /* for struct list_head */ +#include /* for struct hlist_bl_head */ #include /* for struct rwlock_t */ #include /* for struct atomic_t */ #include /* for struct refcount_t */ @@ -30,6 +31,7 @@ #endif #include /* Netw namespace */ #include +#include #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -271,6 +273,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +struct ip_vs_aligned_lock { + spinlock_t l; /* Protect buckets */ +} ____cacheline_aligned_in_smp; + /* For arrays per family */ enum { IP_VS_AF_INET, @@ -484,6 +490,197 @@ struct ip_vs_est_kt_data { int est_row; /* estimated row */ }; +/* IPVS resizable hash tables */ +struct ip_vs_rht { + struct hlist_bl_head *buckets; + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ + seqcount_t *seqc; /* Protects moves */ + struct ip_vs_aligned_lock *lock; /* Protect seqc */ + int mask; /* Buckets mask */ + int size; /* Buckets */ + int seqc_mask; /* seqc mask */ + int lock_mask; /* lock mask */ + u32 table_id; + int u_thresh; /* upper threshold */ + int l_thresh; /* lower threshold */ + int lfactor; /* Load Factor (shift)*/ + int bits; /* size = 1 << bits */ + siphash_key_t hash_key; + struct rcu_head rcu_head; +}; + +/** + * ip_vs_rht_for_each_table() - Walk the hash tables + * @table: struct ip_vs_rht __rcu *table + * @t: current table, 
used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * Walk tables assuming others can not change the installed tables + */ +#define ip_vs_rht_for_each_table(table, t, p) \ + for (p = NULL, t = rcu_dereference_protected(table, 1); \ + t != p; \ + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) + +/** + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * We usually search in one table and also in second table on resizing + */ +#define ip_vs_rht_for_each_table_rcu(table, t, p) \ + for (p = NULL, t = rcu_dereference(table); \ + t != p; \ + p = t, t = rcu_dereference(t->new_tbl)) + +/** + * ip_vs_rht_for_each_bucket() - Walk all table buckets + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: bucket index, used as cursor, u32 var + * @head: bucket address, used as cursor, struct hlist_bl_head *var + */ +#define ip_vs_rht_for_each_bucket(t, bucket, head) \ + for (bucket = 0, head = (t)->buckets; \ + bucket < t->size; bucket++, head++) + +/** + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @sc: temp seqcount_t *var + * @retry: temp int var + */ +#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ + retry = read_seqcount_retry(sc, seq)) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ + struct ip_vs_rht *_t, *_p; \ + unsigned int _seq; \ + seqcount_t *_sc; \ + u32 _bucket; \ + int _retry +/** + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_buckets_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ + _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_bucket_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ + unsigned int _seq; \ + seqcount_t *_sc; \ + int _retry +/** + * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock + * @t: current table, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ + if 
(({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ + {} \ + else \ + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_safe_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket +/** + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete entries but moving is disabled + * Using cond_resched_rcu() should be safe if tables do not change + */ +#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket + +/** + * ip_vs_rht_walk_buckets() - Walk all buckets + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Use if others can not add/delete/move entries + */ +#define ip_vs_rht_walk_buckets(table, head) \ + ip_vs_rht_for_each_table(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/* Entries can be in one of two tables, so we flip bit when new table is + * created and store it as highest bit in hash keys + */ +#define IP_VS_RHT_TABLE_ID_MASK BIT(31) + +/* Check if hash key is from this table */ +static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) +{ + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); +} + +/* Build per-table hash key from hash value */ +static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) +{ + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); +} + +void ip_vs_rht_free(struct ip_vs_rht *t); +void ip_vs_rht_rcu_free(struct rcu_head *head); +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits); +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits); +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2); + struct dst_entry; struct iphdr; struct ip_vs_conn; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9065da3cdd12..be3f21519cca 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -79,11 +79,6 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define IP_VS_ADDRSTRLEN (8+1) #endif -struct ip_vs_aligned_lock -{ - spinlock_t l; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d9be2c189fb3..b54bcd0aeb1d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -118,6 +118,175 @@ void ip_vs_init_hash_table(struct list_head *table, int rows) INIT_LIST_HEAD(&table[rows]); } +/* IPVS Resizable Hash Tables: + * - list_bl buckets with bit lock + * + * Goals: + * - RCU lookup for entry can run in parallel with add/del/move operations + * - resizing 
can trigger on load change or depending on key refresh period + * - add/del/move operations should be allowed for any context + * + * Resizing: + * - new table is attached to the current table and all entries are moved + * with new hash key. Finally, the new table is installed as current one and + * the old table is released after RCU grace period. + * - RCU read-side critical sections will walk two tables while resizing is + * in progress + * - new entries are added to the new table + * - entries will be deleted from the old or from the new table, the table_id + * can be saved into entry as part of the hash key to know where the entry is + * hashed + * - move operations may delay readers or to cause retry for the modified + * bucket. As result, searched entry will be found but walkers that operate + * on multiple entries may see same entry twice if bucket walking is retried. + * - for fast path the number of entries (load) can be compared to u_thresh + * and l_thresh to decide when to trigger table growing/shrinking. They + * are calculated based on load factor (shift count), positive value allows + * load to be below 100% to reduce collisions by maintaining larger table + * while negative value tolerates collisions by using smaller table and load + * above 100%. + * + * Locking: + * - lock: protect seqc if other context except resizer can move entries + * - seqc: seqcount_t, delay/retry readers while entries are moved to + * new table on resizing + * - bit lock: serialize bucket modifications + * - writers may use other locking mechanisms to serialize operations for + * resizing, moving and installing new tables + */ + +void ip_vs_rht_free(struct ip_vs_rht *t) +{ + kvfree(t->buckets); + kvfree(t->seqc); + kvfree(t->lock); + kfree(t); +} + +void ip_vs_rht_rcu_free(struct rcu_head *head) +{ + struct ip_vs_rht *t; + + t = container_of(head, struct ip_vs_rht, rcu_head); + ip_vs_rht_free(t); +} + +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks) +{ + struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL); + int i; + + if (!t) + return NULL; + if (scounts) { + int ml = roundup_pow_of_two(nr_cpu_ids); + + scounts = min(scounts, buckets); + scounts = min(scounts, ml); + t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL); + if (!t->seqc) + goto err; + for (i = 0; i < scounts; i++) + seqcount_init(&t->seqc[i]); + + if (locks) { + locks = min(locks, scounts); + t->lock = kvmalloc_array(locks, sizeof(*t->lock), + GFP_KERNEL); + if (!t->lock) + goto err; + for (i = 0; i < locks; i++) + spin_lock_init(&t->lock[i].l); + } + } + + t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL); + if (!t->buckets) + goto err; + for (i = 0; i < buckets; i++) + INIT_HLIST_BL_HEAD(&t->buckets[i]); + t->mask = buckets - 1; + t->size = buckets; + t->seqc_mask = scounts - 1; + t->lock_mask = locks - 1; + t->u_thresh = buckets; + t->l_thresh = buckets >> 4; + t->bits = order_base_2(buckets); + /* new_tbl points to self if no new table is filled */ + RCU_INIT_POINTER(t->new_tbl, t); + get_random_bytes(&t->hash_key, sizeof(t->hash_key)); + return t; + +err: + ip_vs_rht_free(t); + return NULL; +} + +/* Get the desired table size for n entries based on current table size + * lfactor: negative (u_thresh=size << -lfactor), else u_thresh=size >> lfactor + */ +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits) +{ + if (!t) + return 1 << min_bits; + n = roundup_pow_of_two(n); + if (lfactor >= 0) { + int factor 
= min(lfactor, max_bits); + + n = min(n, 1 << (max_bits - factor)); + n <<= factor; + } else { + n = min(n >> -lfactor, 1 << max_bits); + } + if (lfactor != t->lfactor) + return clamp(n, 1 << min_bits, 1 << max_bits); + if (n > t->size) + return n; + if (n > t->size >> 4) + return t->size; + /* Shrink but keep it n * 2 to prevent frequent resizing */ + return clamp(n << 1, 1 << min_bits, 1 << max_bits); +} + +/* Set thresholds based on table size and load factor + * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink + */ +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits) +{ + if (size >= 1 << max_bits) + t->u_thresh = INT_MAX; /* stop growing */ + else if (lfactor >= 0) + t->u_thresh = size >> min(lfactor, max_bits); + else + t->u_thresh = min(size, 1 << (30 + lfactor)) << -lfactor; + + /* l_thresh: shrink when load is 16 times lower, can be 0 */ + if (size >= 1 << max_bits) + t->l_thresh = (1 << max_bits) >> 4; + else if (size > 1 << min_bits) + t->l_thresh = t->u_thresh >> 4; + else + t->l_thresh = 0; /* stop shrinking */ +} + +/* Return hash value for local info (fast, insecure) */ +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2) +{ + u32 v3; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + v3 = ipv6_addr_hash(&addr->in6); + else +#endif + v3 = addr->all[0]; + + return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); +} + static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { From patchwork Tue Dec 12 16:24:38 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489599 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="PM8vtdCJ" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 93488B0; Tue, 12 Dec 2023 08:31:04 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 38AA81E304; Tue, 12 Dec 2023 18:31:03 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 1EEDE1E0CF; Tue, 12 Dec 2023 18:31:03 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id AB4F53C07DB; Tue, 12 Dec 2023 18:30:13 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398613; bh=cMkerCG1w7slpM53KbM2I7Vc1SCFdRjWMMm/SB5BBlQ=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=PM8vtdCJXjBVW/cUn3Db3yPBMyxAhK8xXh4Ry3aYfFJpyVp/xCaJIPN6pd5UK4eht 34OMmiRDkhXJh7+bnPJDQGYG6mDV4rV6BCXjq8Yk8YikxUibruKfzTvu7afNep49+d iyicQal7OKBZ8bxn1wZYvbOIzriMb+oSMgYqZ/LY= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQVUm094080; Tue, 12 Dec 2023 18:26:31 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQVlk094079; Tue, 12 Dec 2023 18:26:31 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 08/14] ipvs: use resizable hash table for services Date: Tue, 12 Dec 2023 18:24:38 +0200 Message-ID: <20231212162444.93801-9-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: 
<20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Make the hash table for services resizable in the bit range of 4-20. Table is attached only while services are present. Resizing is done by delayed work based on load (the number of hashed services). Table grows when load increases 2+ times (above 12.5% with factor=3) and shrinks 8+ times when load decreases 16+ times (below 0.78%). Switch to jhash hashing to reduce the collisions for multiple services. Add a hash_key field into the service that includes table ID and bucket ID. This helps the lookup and delete operations. Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 49 ++- net/netfilter/ipvs/ip_vs_ctl.c | 676 +++++++++++++++++++++++++++------ 2 files changed, 594 insertions(+), 131 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 51adba5f1bb9..1fe04b91d4a7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -35,12 +35,10 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* svc_table limits */ +#define IP_VS_SVC_TAB_MIN_BITS 4 +#define IP_VS_SVC_TAB_MAX_BITS 20 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -51,8 +49,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; -extern struct mutex __ip_vs_mutex; - struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -289,6 +285,12 @@ static inline int ip_vs_af_index(int af) return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET; } +/* work_flags */ +enum { + IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ + IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ +}; + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -888,14 +890,15 @@ struct ip_vs_dest_user_kern { * forwarding entries. 
*/ struct ip_vs_service { - struct hlist_node s_list; /* node in service table */ - atomic_t refcnt; /* reference counter */ - + struct hlist_bl_node s_list; /* node in service table */ + u32 hash_key; /* Key for the hash table */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ + union nf_inet_addr addr; /* IP address for virtual service */ - __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ + atomic_t refcnt; /* reference counter */ + __be16 port; /* port number for the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ @@ -1153,6 +1156,10 @@ struct netns_ipvs { struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ + struct mutex service_mutex; /* service reconfig */ + struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct delayed_work svc_resize_work; /* resize svc_table */ + atomic_t svc_table_changes;/* ++ on new table */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ @@ -1216,6 +1223,7 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_svc_lfactor; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1225,6 +1233,7 @@ struct netns_ipvs { int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; + unsigned long work_flags; /* IP_VS_WORK_* flags */ /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ @@ -1256,9 +1265,7 @@ struct netns_ipvs { unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ - /* the service mutex that protect svc_table and svc_fwm_table */ - struct mutex service_mutex; - struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ + struct ip_vs_rht __rcu *svc_table; /* Services */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1495,6 +1502,18 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map num_services/u_thresh to t->size + * Large value decreases u_thresh to reduce collisions but increases + * the table size + * Returns factor where: + * - non-negative: u_thresh = size >> factor, eg. lfactor 2 = 25% load + * - negative: u_thresh = size << factor, eg. 
lfactor -1 = 200% load + */ +static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_svc_lfactor); +} + /* IPVS core functions * (from ip_vs_core.c) */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 94123a55e1bd..e2429fcc0bc3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -294,47 +294,59 @@ ip_vs_use_count_dec(void) } - +/* Service hashing: + * Operation Locking order + * --------------------------------------------------------------------------- + * add table service_mutex, svc_resize_sem(W) + * del table service_mutex + * move between tables svc_resize_sem(W), seqcount_t(W), bit lock + * add/del service service_mutex, bit lock + * find service RCU, seqcount_t(R) + * walk services(blocking) service_mutex, svc_resize_sem(R) + * walk services(non-blocking) RCU, seqcount_t(R) + * + * - new tables are linked/unlinked under service_mutex and svc_resize_sem + * - new table is linked on resizing and all operations can run in parallel + * in 2 tables until the new table is registered as current one + * - two contexts can modify buckets: config and table resize, both in + * process context + * - only table resizer can move entries, so we do not protect t->seqc[] + * items with t->lock[] + * - lookups occur under RCU lock and seqcount reader lock to detect if + * services are moved to new table + * - move operations may disturb readers: find operation will not miss entries + * but walkers may see same entry twice if they are forced to retry chains + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold + * service_mutex to disallow new tables to be installed or to check + * svc_table_changes and repeat the RCU read section if new table is installed + */ /* * Returns hash value for virtual service */ -static inline unsigned int -ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, +static inline u32 +ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - unsigned int porth = ntohs(port); - __be32 addr_fold = addr->ip; - __u32 ahash; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - ahash = ntohl(addr_fold); - ahash ^= ((size_t) ipvs >> 8); - - return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & - IP_VS_SVC_TAB_MASK; + return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); } /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) +static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, + __u32 fwmark) { - return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); } -/* - * Hashes a service in the svc_table by - * or by fwmark. - * Should be called with locked tables. 
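A worked example of the load-factor mapping described above, restating the u_thresh/l_thresh arithmetic from ip_vs_rht_set_thresholds() for a table that is not at its size limits (sketch only; the helper name is made up for illustration):

	/* Sketch: thresholds for a mid-sized table with lfactor >= 0 */
	static void example_svc_thresholds(int size, int lfactor,
					   int *u_thresh, int *l_thresh)
	{
		*u_thresh = size >> lfactor;	/* 256 >> 3 = 32, ~12.5% load */
		*l_thresh = *u_thresh >> 4;	/* 32 >> 4 = 2, ~0.78% load  */
	}

With the default lfactor of 3, a 256-bucket service table therefore grows once more than 32 services are hashed and becomes a shrink candidate when 2 or fewer remain, which is where the 12.5% and 0.78% figures in the commit message come from.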
- */ +/* Hashes a service in the svc_table by or by fwmark */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned int hash; + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", @@ -342,23 +354,32 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) return 0; } + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + + /* New entries go into recent table */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + t = rcu_dereference_protected(t->new_tbl, 1); + if (svc->fwmark == 0) { /* - * Hash it by + * Hash it by */ - hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, + hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, &svc->addr, svc->port); } else { /* * Hash it by fwmark */ - hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); + hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); } - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); - + head = t->buckets + (hash & t->mask); + hlist_bl_lock(head); + WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); + hlist_bl_add_head_rcu(&svc->s_list, head); + hlist_bl_unlock(head); + return 1; } @@ -369,17 +390,45 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash_key2; + u32 hash_key; + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } + t = rcu_dereference_protected(ipvs->svc_table, 1); + hash_key = READ_ONCE(svc->hash_key); + /* We need to lock the bucket in the right table */ + if (ip_vs_rht_same_table(t, hash_key)) { + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + /* Ensure hash_key is read under lock */ + hash_key2 = READ_ONCE(svc->hash_key); + /* Moved to new table ? 
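For reference, a condensed sketch of how the hash_key saved in each service is used to find the right table and bucket, restating ip_vs_rht_build_hash_key() and ip_vs_rht_same_table() from the infrastructure patch (the hash value below is illustrative only):

	u32 hash_key = t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK);

	/* e.g. in unhash: pick the table the entry currently lives in */
	if ((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK)
		t = rcu_dereference_protected(t->new_tbl, 1);	/* moved */
	head = t->buckets + (hash_key & t->mask);		/* bucket */

Because consecutive tables flip bit 31 in table_id, reading hash_key is enough to tell where the entry was hashed; the unhash path still re-reads it under the bucket lock in case the resizer moved the entry in the meantime.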
*/ + if (hash_key != hash_key2) { + hlist_bl_unlock(head); + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key2 & t->mask); + hlist_bl_lock(head); + } + } else { + /* It is already moved to new table */ + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + } /* Remove it from svc_table */ - hlist_del_rcu(&svc->s_list); + hlist_bl_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); + hlist_bl_unlock(head); return 1; } @@ -391,18 +440,29 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && - svc->port == vport && svc->protocol == protocol && - !svc->fwmark) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->af == af && + ip_vs_addr_equal(af, &svc->addr, vaddr) && + svc->port == vport && + svc->protocol == protocol && !svc->fwmark) { + /* HIT */ + return svc; + } + } } } @@ -416,16 +476,26 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashval(t, af, fwmark); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + return svc; + } + } } } @@ -488,6 +558,220 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol return svc; } +/* Return the number of registered services */ +static int ip_vs_get_num_services(struct netns_ipvs *ipvs) +{ + int ns = 0, ni = IP_VS_AF_MAX; + + while (--ni >= 0) + ns += atomic_read(&ipvs->num_services[ni]); + return ns; +} + +/* Get default load factor to map num_services/u_thresh to t->size */ +static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = 3; /* grow if load is above 12.5% */ + else + factor = 2; /* grow if load is above 25% */ + return factor; +} + +/* Get the desired svc_table size */ +static int 
ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs), + lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); +} + +/* Allocate svc_table */ +static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs, + int buckets, int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* No frequent lookups to race with resizing, so use max of 64 + * seqcounts. Only resizer moves entries, so use 0 locks. + */ + scounts = clamp(buckets >> 4, 1, 64); + locks = 0; + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); + return t; +} + +/* svc_table resizer work */ +static void svc_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + struct ip_vs_rht *t_free = NULL; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct ip_vs_service *svc; + struct netns_ipvs *ipvs; + bool more_work = true; + seqcount_t *sc; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work); + + if (!down_write_trylock(&ipvs->svc_resize_sem)) + goto out; + if (!mutex_trylock(&ipvs->service_mutex)) + goto unlock_sem; + more_work = false; + clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); + if (!ipvs->enable || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + t = rcu_dereference_protected(ipvs->svc_table, 1); + /* Do nothing if table is removed */ + if (!t) + goto unlock_m; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto unlock_m; + + lfactor = sysctl_svc_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_svc_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto unlock_m; + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto unlock_m; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + /* Allow add/del to new_tbl while moving from old table */ + mutex_unlock(&ipvs->service_mutex); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (!ipvs->enable || + test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) + goto unlock_sem; + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... 
*/ + resched_score = 0; + + sc = &t->seqc[bucket & t->seqc_mask]; + /* seqcount_t usage considering PREEMPT_RT rules: + * - we are the only writer => preemption can be allowed + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + local_bh_disable(); + preempt_disable_nested(); + write_seqcount_begin(sc); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) { + u32 hash; + + /* New hash for the new table */ + if (svc->fwmark == 0) { + /* Hash it by */ + hash = ip_vs_svc_hashval(t_new, svc->af, + svc->protocol, + &svc->addr, svc->port); + } else { + /* Hash it by fwmark */ + hash = ip_vs_svc_fwm_hashval(t_new, svc->af, + svc->fwmark); + } + hlist_bl_del_rcu(&svc->s_list); + head2 = t_new->buckets + (hash & t_new->mask); + + hlist_bl_lock(head2); + WRITE_ONCE(svc->hash_key, + ip_vs_rht_build_hash_key(t_new, hash)); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_add_head_rcu(&svc->s_list, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(sc); + preempt_enable_nested(); + local_bh_enable(); + if (limit >= 64) + goto same_bucket; + } + + /* Tables can be switched only under service_mutex */ + while (!mutex_trylock(&ipvs->service_mutex)) { + cond_resched(); + if (!ipvs->enable || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_sem; + } + if (!ipvs->enable || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_sem; + + rcu_assign_pointer(ipvs->svc_table, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); + t_free = t; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + +unlock_sem: + up_write(&ipvs->svc_resize_sem); + + if (t_free) { + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. 
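For context, the write_seqcount_begin/end section above pairs with the reader loop hidden in ip_vs_rht_for_bucket_retry() from the infrastructure patch; roughly (sketch only, match() stands in for the usual address/port/fwmark comparison):

	sc = &t->seqc[hash_key & t->seqc_mask];
	do {
		seq = read_seqcount_begin(sc);
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list)
			if (READ_ONCE(svc->hash_key) == hash_key && match(svc))
				return svc;	/* hit */
	} while (read_seqcount_retry(sc, seq));

A reader that races with the move above simply rescans the bucket, so lookups never miss an entry, at the cost of possibly seeing the same entry twice when walking whole chains, as noted in the locking comments earlier.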
+ */ + synchronize_rcu(); + ip_vs_rht_free(t_free); + } + +out: + if (!ipvs->enable || !more_work || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + return; + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); +} static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) @@ -1357,12 +1641,13 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_rht *t, *t_new = NULL; int af_id = ip_vs_af_index(u->af); - struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct ip_vs_pe *pe = NULL; int ret_hooks = -1; + int ret = 0; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1404,6 +1689,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (!t) { + int lfactor = sysctl_svc_lfactor(ipvs); + int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + ret = -ENOMEM; + goto out_err; + } + } + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) @@ -1453,6 +1750,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, RCU_INIT_POINTER(svc->pe, pe); pe = NULL; + if (t_new) { + clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + rcu_assign_pointer(ipvs->svc_table, t_new); + t_new = NULL; + } + + /* Hash the service into the service table */ + ip_vs_svc_hash(svc); + /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter[af_id]); @@ -1467,8 +1773,11 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, atomic_inc(&ipvs->nonfwm_services[af_id]); atomic_inc(&ipvs->num_services[af_id]); - /* Hash the service into the service table */ - ip_vs_svc_hash(svc); + /* Schedule resize work */ + if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); *svc_p = svc; @@ -1484,6 +1793,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (t_new) + ip_vs_rht_free(t_new); if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { @@ -1671,10 +1982,38 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) */ static int ip_vs_del_service(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs; + struct ip_vs_rht *t, *p; + int ns; + if (svc == NULL) return -EEXIST; + ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - + t = rcu_dereference_protected(ipvs->svc_table, 1); + + /* Drop the table if no more services */ + ns = ip_vs_get_num_services(ipvs); + if (!ns) { + /* Stop the resizer and drop the tables */ + set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + cancel_delayed_work_sync(&ipvs->svc_resize_work); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } + } + } else if (ns <= t->l_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) { + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + } return 0; } @@ -1684,14 +2023,36 @@ static int ip_vs_del_service(struct ip_vs_service *svc) */ 
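Putting the sizing math and the add-side trigger together (a worked example using the defaults, not new code): the first table is allocated with 1 << IP_VS_SVC_TAB_MIN_BITS = 16 buckets, so with lfactor 3 its u_thresh is 16 >> 3 = 2; adding a third service makes the check in ip_vs_add_service() true:

	/* Schedule resize work */
	if (t && ip_vs_get_num_services(ipvs) > t->u_thresh &&
	    !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
				   1);

and the worker then computes the new size as roundup_pow_of_two(3) << 3 = 32 buckets, links it through new_tbl, moves the three services and finally installs it as ipvs->svc_table.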
static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - struct hlist_node *n; + struct hlist_bl_node *ne; + struct hlist_bl_node *e; + struct ip_vs_rht *t, *p; + + /* Stop the resizer and drop the tables */ + if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + cancel_delayed_work_sync(&ipvs->svc_resize_work); + /* No resizer, so now we have exclusive write access */ + + if (ip_vs_get_num_services(ipvs)) { + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) + ip_vs_unlink_service(svc, cleanup); + } + } - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], - s_list) - ip_vs_unlink_service(svc, cleanup); + /* Unregister the hash table and release it after RCU grace period */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } return 0; } @@ -1742,19 +2103,44 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; struct ip_vs_dest *dest; - unsigned int idx; + int old_gen, new_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + + old_gen = atomic_read(&ipvs->svc_table_changes); + rcu_read_lock(); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) + +repeat: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { list_for_each_entry_rcu(dest, &svc->destinations, - n_list) + n_list) { ip_vs_forget_dev(dest, dev); + resched_score += 10; + } + resched_score++; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } + } } rcu_read_unlock(); @@ -1777,14 +2163,28 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) static int ip_vs_zero_all(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; + + rcu_read_lock(); - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { ip_vs_zero_service(svc); + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + } } + rcu_read_unlock(); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -2218,7 +2618,8 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - int bucket; + struct ip_vs_rht *t; + u32 bucket; }; /* @@ -2239,17 +2640,23 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) } } - +/* Do not expect consistent view during add, del and move(table resize). + * We may miss entries and even show duplicates. + */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; - int idx; + struct ip_vs_rht *t = iter->t; struct ip_vs_service *svc; + struct hlist_bl_node *e; + int idx; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { + if (!t) + return NULL; + for (idx = 0; idx < t->size; idx++) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; if (pos-- == 0) { iter->bucket = idx; return svc; @@ -2262,18 +2669,22 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ip_vs_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + rcu_read_lock(); + iter->t = rcu_dereference(ipvs->svc_table); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct hlist_node *e; - struct ip_vs_iter *iter; struct ip_vs_service *svc; - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_iter *iter; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) @@ -2281,15 +2692,22 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; + t = iter->t; + if (!t) + return NULL; - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); - if (e) - return hlist_entry(e, struct ip_vs_service, s_list); + hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { + /* Our cursor was moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; + return svc; + } - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - hlist_for_each_entry_rcu(svc, - &ipvs->svc_table[iter->bucket], - s_list) { + while (++iter->bucket < t->size) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], + s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; return svc; } } @@ -2770,13 +3188,18 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { - int idx, count=0; - struct ip_vs_service *svc; struct ip_vs_service_entry entry; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int count = 0; int ret = 0; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + lockdep_assert_held(&ipvs->svc_resize_sem); + /* All service modifications are disabled, go ahead */ + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET) continue; @@ -2948,6 +3371,35 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } + if (cmd == IP_VS_SO_GET_SERVICES) { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = struct_size(get, entrytable, get->num_services); + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + return -EINVAL; + } + /* Protect against table resizer moving the entries. + * Try reverse locking, so that we do not hold the mutex + * while waiting for semaphore. + */ + while (1) { + ret = down_read_killable(&ipvs->svc_resize_sem); + if (ret < 0) + return ret; + if (mutex_trylock(&ipvs->service_mutex)) + break; + up_read(&ipvs->svc_resize_sem); + cond_resched(); + } + ret = __ip_vs_get_service_entries(ipvs, get, user); + up_read(&ipvs->svc_resize_sem); + mutex_unlock(&ipvs->service_mutex); + return ret; + } + mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: @@ -2976,22 +3428,6 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } break; - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - int size; - - get = (struct ip_vs_get_services *)arg; - size = struct_size(get, entrytable, get->num_services); - if (*len != size) { - pr_err("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(ipvs, get, user); - } - break; - case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; @@ -3277,15 +3713,19 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb, static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int start = cb->args[0]; + int idx = 0; + down_read(&ipvs->svc_resize_sem); rcu_read_lock(); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) { + ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3297,6 +3737,7 @@ static int 
ip_vs_genl_dump_services(struct sk_buff *skb, nla_put_failure: rcu_read_unlock(); + up_read(&ipvs->svc_resize_sem); cb->args[0] = idx; return skb->len; @@ -4300,8 +4741,10 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) - INIT_HLIST_HEAD(&ipvs->svc_table[idx]); + init_rwsem(&ipvs->svc_resize_sem); + INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); + atomic_set(&ipvs->svc_table_changes, 0); + RCU_INIT_POINTER(ipvs->svc_table, NULL); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4320,6 +4763,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); + ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs); /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); From patchwork Tue Dec 12 16:24:39 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489600 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="H5/Y/kNt" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A41ADDC; Tue, 12 Dec 2023 08:31:08 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 3EDA81E319; Tue, 12 Dec 2023 18:31:07 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 23CEE1E220; Tue, 12 Dec 2023 18:31:07 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id CDF663C07DC; Tue, 12 Dec 2023 18:30:13 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398613; bh=nH5MgXzydIfzypRF2rvDALvpPTT6ylLbHZaHXy4G7M4=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=H5/Y/kNt9C6EVzTQtU6qQv23dO5Ato5y7yj4uBAVmrTjCjG92FgLx5pSCmZsi+jBj VvWZ5UGh48ZbdauR6WP683eNS+lef7BTwi5wmzt3f/IL/UrtPdqVR0OeCXaq3hotua WCxgogd4gesCeL+JLlp1LsQOSaCWmLP965hV4juU= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQVNF094084; Tue, 12 Dec 2023 18:26:31 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQVkk094083; Tue, 12 Dec 2023 18:26:31 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 09/14] ipvs: switch to per-net connection table Date: Tue, 12 Dec 2023 18:24:39 +0200 Message-ID: <20231212162444.93801-10-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Use per-net resizable hash table for connections. The global table is slow to walk when using many namespaces. The table can be resized in the range of [256 - ip_vs_conn_tab_size]. Table is attached only while services are present. 
Resizing is done by delayed work based on load (the number of connections). Add a hash_key field into the connection that includes table ID and bucket ID. This helps the lookup and delete operations. ip_vs_conn_unlink() will not calculate the hash but will use and validate the saved hash_key under lock. For better security switch from jhash to siphash for the default connection hashing but the persistence engines may use their own function. Keeping the hash table loaded with entries below the size (12%) allows to avoid collision for 96+% of the conns. ip_vs_conn_fill_cport() now will rehash the connection with proper locking because unhash+hash is not safe for RCU readers. To invalidate the templates setting just dport to 0xffff is enough, no need to rehash them. As result, ip_vs_conn_unhash() is now unused and removed. Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 34 +- net/netfilter/ipvs/ip_vs_conn.c | 868 +++++++++++++++++++++--------- net/netfilter/ipvs/ip_vs_ctl.c | 18 + net/netfilter/ipvs/ip_vs_pe_sip.c | 4 +- net/netfilter/ipvs/ip_vs_sync.c | 23 + 5 files changed, 679 insertions(+), 268 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1fe04b91d4a7..f5605c289bba 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -36,6 +36,14 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* conn_tab limits (as per Kconfig) */ +#define IP_VS_CONN_TAB_MIN_BITS 8 +#if BITS_PER_LONG > 32 +#define IP_VS_CONN_TAB_MAX_BITS 27 +#else +#define IP_VS_CONN_TAB_MAX_BITS 20 +#endif + /* svc_table limits */ #define IP_VS_SVC_TAB_MIN_BITS 4 #define IP_VS_SVC_TAB_MAX_BITS 20 @@ -289,6 +297,7 @@ static inline int ip_vs_af_index(int af) enum { IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ }; /* The port number of FTP service (in network order). */ @@ -778,18 +787,19 @@ struct ip_vs_conn_param { /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_node c_list; /* hashed list heads */ + struct hlist_bl_node c_list; /* node in conn_tab */ + __u32 hash_key; /* Key for the hash table */ /* Protocol, addresses and port numbers */ __be16 cport; __be16 dport; __be16 vport; u16 af; /* address family */ + __u16 protocol; /* Which protocol (TCP/UDP) */ + __u16 daf; /* Address family of the dest */ union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ struct netns_ipvs *ipvs; /* counter and timer */ @@ -1008,8 +1018,8 @@ struct ip_vs_pe { int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, - bool inverse); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, + struct ip_vs_rht *t, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, @@ -1148,6 +1158,7 @@ struct netns_ipvs { #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ + struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ @@ -1223,6 +1234,7 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_conn_lfactor; int sysctl_svc_lfactor; /* ip_vs_lblc */ @@ -1266,6 +1278,8 @@ struct netns_ipvs { unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ struct ip_vs_rht __rcu *svc_table; /* Services */ + struct ip_vs_rht __rcu *conn_tab; /* Connections */ + atomic_t conn_tab_changes;/* ++ on new table */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1502,6 +1516,12 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map conn_count/u_thresh to t->size */ +static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_conn_lfactor); +} + /* Get load factor to map num_services/u_thresh to t->size * Large value decreases u_thresh to reduce collisions but increases * the table size @@ -1587,6 +1607,10 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor); +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index be3f21519cca..4c03736a602a 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -47,14 +47,8 @@ static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); -/* size and mask values */ +/* Max table size */ int ip_vs_conn_tab_size __read_mostly; -static int ip_vs_conn_tab_mask __read_mostly; - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; @@ -62,16 +56,6 @@ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; /* counter for no client port connections */ static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd __read_mostly; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 5 -#define CT_LOCKARRAY_SIZE (1<lock protects conn fields like cp->flags, cp->dest + */ -static inline void ct_write_lock_bh(unsigned int key) +/* Lock conn_tab bucket for conn hash/unhash, not for rehash */ +static __always_inline void +conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, + bool new_hash, struct hlist_bl_head **head_ret) { - spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + struct hlist_bl_head *head; + u32 hash_key_new; + + if (!new_hash) { + /* We need to lock the bucket in the right table */ + +retry: + if (!ip_vs_rht_same_table(t, hash_key)) { + /* It is already moved to new table */ + t = rcu_dereference(t->new_tbl); + } + } + + head = t->buckets + (hash_key & t->mask); + + local_bh_disable(); + /* Do not touch seqcount, this is a safe operation */ + + hlist_bl_lock(head); + if (!new_hash) { + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(cp->hash_key); + /* Hash changed ? 
*/ + if (hash_key != hash_key_new) { + hlist_bl_unlock(head); + local_bh_enable(); + hash_key = hash_key_new; + goto retry; + } + } + *head_ret = head; } -static inline void ct_write_unlock_bh(unsigned int key) +static inline void conn_tab_unlock(struct hlist_bl_head *head) { - spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + hlist_bl_unlock(head); + local_bh_enable(); } static void ip_vs_conn_expire(struct timer_list *t); @@ -98,30 +125,31 @@ static void ip_vs_conn_expire(struct timer_list *t); /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, - const union nf_inet_addr *addr, - __be16 port) +static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port) { + u64 a = (u32)proto << 16 | (__force u32)port; + #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), - (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + if (af == AF_INET6) { + u64 b = (u64)addr->all[0] << 32 | addr->all[1]; + u64 c = (u64)addr->all[2] << 32 | addr->all[3]; + + return (u32)siphash_3u64(a, b, c, &t->hash_key); + } #endif - return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + a |= (u64)addr->all[0] << 32; + return (u32)siphash_1u64(a, &t->hash_key); } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, - bool inverse) + struct ip_vs_rht *t, bool inverse) { const union nf_inet_addr *addr; __be16 port; if (p->pe_data && p->pe->hashkey_raw) - return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & - ip_vs_conn_tab_mask; + return p->pe->hashkey_raw(p, t, inverse); if (likely(!inverse)) { addr = p->caddr; @@ -131,10 +159,11 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port); } -static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, + const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; @@ -147,31 +176,36 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) p.pe_data_len = cp->pe_data_len; } - return ip_vs_conn_hashkey_param(&p, false); + return ip_vs_conn_hashkey_param(&p, t, false); } -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. +/* Hashes ip_vs_conn in conn_tab * returns bool success. 
*/ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash_key; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return 0; - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey_conn(cp); + /* New entries go into recent table */ + t = rcu_dereference(ipvs->conn_tab); + t = rcu_dereference(t->new_tbl); - ct_write_lock_bh(hash); + hash_key = ip_vs_rht_build_hash_key(t, ip_vs_conn_hashkey_conn(t, cp)); + conn_tab_lock(t, cp, hash_key, true /* new_hash */, &head); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hash_key, hash_key); refcount_inc(&cp->refcnt); - hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); + hlist_bl_add_head_rcu(&cp->c_list, head); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pS\n", @@ -180,75 +214,58 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); - - return ret; -} - + conn_tab_unlock(head); -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. Caller should hold conn reference. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned int hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey_conn(cp); - - ct_write_lock_bh(hash); - spin_lock(&cp->lock); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del_rcu(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - refcount_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + /* Schedule resizing if load increases */ + if (atomic_read(&ipvs->conn_count) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags)) + mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); return ret; } -/* Try to unlink ip_vs_conn from ip_vs_conn_tab. +/* Try to unlink ip_vs_conn from conn_tab. * returns bool success. */ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; bool ret = false; + u32 hash_key; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return refcount_dec_if_one(&cp->refcnt); - hash = ip_vs_conn_hashkey_conn(cp); + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(cp->hash_key); - ct_write_lock_bh(hash); + conn_tab_lock(t, cp, hash_key, false /* new_hash */, &head); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { - hlist_del_rcu(&cp->c_list); + hlist_bl_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + conn_tab_unlock(head); + + rcu_read_unlock(); return ret; } /* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from OUTside-to-INside. 
* p->caddr, p->cport: pkt source address (foreign host) * p->vaddr, p->vport: pkt dest address (load balancer) @@ -256,26 +273,38 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->cport == cp->cport && p->vport == cp->vport && - cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && - ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - rcu_read_unlock(); - return cp; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + if (READ_ONCE(cp->hash_key) == hash_key && + p->cport == cp->cport && + p->vport == cp->vport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->vaddr) && + (!p->cport ^ + (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) { + /* HIT */ + rcu_read_unlock(); + return cp; + } + } + } } } @@ -342,37 +371,50 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (unlikely(p->pe_data && p->pe->ct_match)) { - if (cp->ipvs != p->ipvs) - continue; - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { - if (__ip_vs_conn_get(cp)) - goto out; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + if (READ_ONCE(cp->hash_key) != hash_key) + continue; + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (p->pe == cp->pe && + p->pe->ct_match(p, cp) && + __ip_vs_conn_get(cp)) + goto out; + continue; + } + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark + */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? + AF_UNSPEC : p->af, + p->vaddr, &cp->vaddr) && + p->vport == cp->vport && + p->cport == cp->cport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol && + cp->dport != htons(0xffff)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } - continue; } - if (cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - /* protocol should only be IPPROTO_IP if - * p->vaddr is a fwmark */ - ip_vs_addr_equal(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : - p->af, p->vaddr, &cp->vaddr) && - p->vport == cp->vport && p->cport == cp->cport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (__ip_vs_conn_get(cp)) - goto out; - } } cp = NULL; @@ -388,58 +430,64 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) return cp; } -/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. +/* Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from inside-to-OUTside. * p->caddr, p->cport: pkt source address (inside host) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned int hash; - struct ip_vs_conn *cp, *ret=NULL; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; const union nf_inet_addr *saddr; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + struct ip_vs_conn *cp; + u32 hash, hash_key; __be16 sport; - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey_param(p, true); - rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->vport != cp->cport) - continue; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, true); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + if (READ_ONCE(cp->hash_key) != hash_key || + p->vport != cp->cport) + continue; - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - sport = cp->vport; - saddr = &cp->vaddr; - } else { - sport = cp->dport; - saddr = &cp->daddr; - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + sport = cp->vport; + saddr = &cp->vaddr; + } else { + sport = cp->dport; + saddr = &cp->daddr; + } - if (p->cport == sport && cp->af == p->af && - ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->caddr, saddr) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - ret = cp; - break; + if (p->cport == sport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, saddr) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) + goto out; + } + } } } + cp = NULL; +out: rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), - ret ? "hit" : "not hit"); + cp ? 
"hit" : "not hit"); - return ret; + return cp; } struct ip_vs_conn * @@ -484,20 +532,260 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) */ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { - if (ip_vs_conn_unhash(cp)) { - spin_lock_bh(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock_bh(&cp->lock); + struct hlist_bl_head *head, *head2, *head_new; + struct netns_ipvs *ipvs = cp->ipvs; + u32 hash_r = 0, hash_key_r = 0; + struct ip_vs_rht *t, *tp, *t2; + u32 hash_key, hash_key_new; + struct ip_vs_conn_param p; + int ntbl; + + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, + cport, &cp->vaddr, cp->vport, &p); + ntbl = 0; + + /* Attempt to rehash cp safely, by informing seqcount readers */ + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(cp->hash_key); + tp = NULL; + +retry: + /* Moved to new table ? */ + if (!ip_vs_rht_same_table(t, hash_key)) { + t = rcu_dereference(t->new_tbl); + ntbl++; + /* We are lost? */ + if (ntbl >= 2) + return; + } + + /* Rehashing during resize? Use the recent table for adds */ + t2 = rcu_dereference(t->new_tbl); + /* Calc new hash once per table */ + if (tp != t2) { + hash_r = ip_vs_conn_hashkey_param(&p, t2, false); + hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); + tp = t2; + } + head = t->buckets + (hash_key & t->mask); + head2 = t2->buckets + (hash_key_r & t2->mask); + head_new = head2; - /* hash on new dport */ - ip_vs_conn_hash(cp); + if (head > head2 && t == t2) + swap(head, head2); + + /* Lock seqcount only for the old bucket, even if we are on new table + * because it affacts the del operation, not the adding. + */ + spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); + + /* Lock buckets in same (increasing) order */ + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(cp->hash_key); + /* Racing with another rehashing ? */ + if (unlikely(hash_key != hash_key_new)) { + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + hash_key = hash_key_new; + goto retry; } + + spin_lock(&cp->lock); + if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && + (cp->flags & IP_VS_CONN_F_HASHED)) { + /* We do not recalc hash_key_r under lock, we assume the + * parameters in cp do not change, i.e. cport is + * the only possible change. 
+ */ + WRITE_ONCE(cp->hash_key, hash_key_r); + if (head != head2) { + hlist_bl_del_rcu(&cp->c_list); + hlist_bl_add_head_rcu(&cp->c_list, head_new); + } + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); +} + +/* Get default load factor to map conn_count/u_thresh to t->size */ +static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = 3; + else + factor = 1; + return factor; +} + +/* Get the desired conn_tab size */ +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count), + lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); +} + +/* Allocate conn_tab */ +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* scounts: affects readers during resize */ + scounts = clamp(buckets >> 6, 1, 256); + /* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */ + locks = clamp(8, 1, scounts); + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); + return t; } +/* conn_tab resizer work */ +static void conn_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct netns_ipvs *ipvs; + struct ip_vs_conn *cp; + bool more_work = false; + u32 hash, hash_key; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, conn_resize_work.work); + + /* Allow work to be queued again */ + clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags); + t = rcu_dereference_protected(ipvs->conn_tab, 1); + /* Do nothing if table is removed */ + if (!t) + goto out; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto out; + + lfactor = sysctl_conn_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_conn_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto out; + + t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto out; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + + /* Wait RCU readers to see the new table, we do not want new + * conns to go into old table and to be left there. + */ + synchronize_rcu(); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... 
*/ + resched_score = 0; + + /* seqcount_t usage considering PREEMPT_RT rules: + * - other writers (SoftIRQ) => serialize with spin_lock_bh + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + spin_lock_bh(&t->lock[bucket & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(cp, cn, nn, head, c_list) { + hash = ip_vs_conn_hashkey_conn(t_new, cp); + hash_key = ip_vs_rht_build_hash_key(t_new, hash); + + head2 = t_new->buckets + (hash & t_new->mask); + hlist_bl_lock(head2); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_del_rcu(&cp->c_list); + WRITE_ONCE(cp->hash_key, hash_key); + hlist_bl_add_head_rcu(&cp->c_list, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[bucket & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[bucket & t->lock_mask].l); + if (limit >= 64) + goto same_bucket; + } + + rcu_assign_pointer(ipvs->conn_tab, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. + */ + synchronize_rcu(); + ip_vs_rht_free(t); + +out: + /* Monitor if we need to shrink table */ + queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, + more_work ? 1 : 2 * HZ); +} /* * Bind a connection entry with the corresponding packet_xmit. @@ -781,17 +1069,11 @@ int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); - /* - * Invalidate the connection template + /* Invalidate the connection template. Prefer to avoid + * rehashing, it will move it as first in chain, so use + * only dport as indication, it is not a hash key. 
*/ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } + ct->dport = htons(0xffff); /* * Simply decrease the refcnt of the template, @@ -929,7 +1211,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) /* - * Create a new connection entry and hash it into the ip_vs_conn_tab + * Create a new connection entry and hash it into the conn_tab */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, @@ -947,7 +1229,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, return NULL; } - INIT_HLIST_NODE(&cp->c_list); + INIT_HLIST_BL_NODE(&cp->c_list); timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; cp->af = p->af; @@ -1028,7 +1310,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, if (ip_vs_conntrack_enabled(ipvs)) cp->flags |= IP_VS_CONN_F_NFCT; - /* Hash it in the ip_vs_conn_tab finally */ + /* Hash it in the conn_tab finally */ ip_vs_conn_hash(cp); return cp; @@ -1040,26 +1322,41 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - struct hlist_head *l; + struct ip_vs_rht *t; + u32 bucket; + int gen; }; static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) { - int idx; - struct ip_vs_conn *cp; struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_rht *t = iter->t; + struct hlist_bl_node *e; + struct ip_vs_conn *cp; + int idx; - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (!t) + return NULL; + for (idx = 0; idx < t->size; idx++) { + hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[idx], c_list) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ + if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + break; if (pos-- == 0) { - iter->l = &ip_vs_conn_tab[idx]; + iter->bucket = idx; return cp; } } - cond_resched_rcu(); + if (!(idx & 31)) { + cond_resched_rcu(); + /* New table installed ? */ + if (iter->gen != atomic_read(&ipvs->conn_tab_changes)) + break; + } } return NULL; @@ -1069,38 +1366,55 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); - iter->l = NULL; rcu_read_lock(); + iter->gen = atomic_read(&ipvs->conn_tab_changes); + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + iter->t = rcu_dereference(ipvs->conn_tab); return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *e; - struct hlist_head *l = iter->l; - int idx; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_conn *cp = v; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) return ip_vs_conn_array(seq, 0); + t = iter->t; + if (!t) + return NULL; + /* more on same hash chain? 
*/ - e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) - return hlist_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - while (++idx < ip_vs_conn_tab_size) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - iter->l = &ip_vs_conn_tab[idx]; + hlist_bl_for_each_entry_continue_rcu(cp, e, c_list) { + /* Our cursor was moved to new table ? */ + if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + break; + return cp; + } + + while (++iter->bucket < t->size) { + hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[iter->bucket], + c_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + break; return cp; } - cond_resched_rcu(); + if (!(iter->bucket & 31)) { + cond_resched_rcu(); + /* New table installed ? */ + if (iter->gen != atomic_read(&ipvs->conn_tab_changes)) + break; + } } - iter->l = NULL; return NULL; } @@ -1118,13 +1432,10 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!net_eq(cp->ipvs->net, net)) - return 0; if (cp->pe_data) { pe_data[0] = ' '; len = strlen(cp->pe->name); @@ -1196,10 +1507,6 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); - - if (!net_eq(cp->ipvs->net, net)) - return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->daf == AF_INET6) @@ -1291,22 +1598,29 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); } -/* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { - int idx; + struct hlist_bl_node *e; struct ip_vs_conn *cp; + struct ip_vs_rht *t; + unsigned int r; + int idx; + r = get_random_u32(); rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; + for (idx = 0; idx < (t->size >> 5); idx++) { + unsigned int hash = (r + idx) & t->mask; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->ipvs != ipvs) - continue; + /* Don't care if due to moved entry we jump to another bucket + * and even to new table + */ + hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[hash], c_list) { if (atomic_read(&cp->n_control)) continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { @@ -1353,27 +1667,39 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) IP_VS_DBG(4, "drop connection\n"); ip_vs_conn_del(cp); } - cond_resched_rcu(); + if (!(idx & 31)) { + cond_resched_rcu(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; + } } + +out: rcu_read_unlock(); } -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ +/* Flush all the connection entries in the conn_tab */ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct ip_vs_conn *cp, *cp_c; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) + return; + cancel_delayed_work_sync(&ipvs->conn_resize_work); + if (!atomic_read(&ipvs->conn_count)) + goto 
unreg; flush_again: + /* Rely on RCU grace period while accessing cp after ip_vs_conn_del */ rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) - continue; + ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { if (atomic_read(&cp->n_control)) continue; cp_c = cp->control; @@ -1394,21 +1720,47 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) schedule(); goto flush_again; } + +unreg: + /* Unregister the hash table and release it after RCU grace period. + * This is needed because other works may not be stopped yet and + * they may walk the tables. + */ + t = rcu_dereference_protected(ipvs->conn_tab, 1); + rcu_assign_pointer(ipvs->conn_tab, NULL); + /* Inform readers that conn_tab is changed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } #ifdef CONFIG_SYSCTL void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; struct ip_vs_conn *cp, *cp_c; + struct hlist_bl_head *head; struct ip_vs_dest *dest; + struct hlist_bl_node *e; + int old_gen, new_gen; + if (!atomic_read(&ipvs->conn_count)) + return; + old_gen = atomic_read(&ipvs->conn_tab_changes); rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) - continue; +repeat: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + resched_score++; dest = cp->dest; if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) continue; @@ -1423,13 +1775,25 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) IP_VS_DBG(4, "del controlling connection\n"); ip_vs_conn_del(cp_c); } + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + goto out; + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } } - cond_resched_rcu(); - - /* netns clean up started, abort delayed work */ - if (!ipvs->enable) - break; } + +out: rcu_read_unlock(); } #endif @@ -1440,6 +1804,10 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { atomic_set(&ipvs->conn_count, 0); + INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler); + RCU_INIT_POINTER(ipvs->conn_tab, NULL); + atomic_set(&ipvs->conn_tab_changes, 0); + ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, @@ -1475,59 +1843,38 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { + int min = IP_VS_CONN_TAB_MIN_BITS; + int max = IP_VS_CONN_TAB_MAX_BITS; size_t tab_array_size; int max_avail; -#if BITS_PER_LONG > 32 - int max = 27; -#else - int max = 20; -#endif - int min = 8; - int idx; max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; - max_avail -= 2; /* ~4 in hash row */ + /* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */ + max_avail += 1; /* hash table loaded at 50% */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); max = clamp(max, min, max_avail); ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; - ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; /* * Allocate the connection hash table and initialize its list heads */ tab_array_size = array_size(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab), GFP_KERNEL); - if (!ip_vs_conn_tab) - return -ENOMEM; + sizeof(struct hlist_bl_head)); /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ip_vs_conn_cachep) { - kvfree(ip_vs_conn_tab); + if (!ip_vs_conn_cachep) return -ENOMEM; - } pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", ip_vs_conn_tab_size, tab_array_size / 1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) - INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return 0; } @@ -1537,5 +1884,4 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e2429fcc0bc3..72f39e3b4a7c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1642,6 +1642,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { struct ip_vs_scheduler *sched = NULL; + struct ip_vs_rht *tc_new = NULL; struct ip_vs_rht *t, *t_new = NULL; int af_id = ip_vs_af_index(u->af); struct ip_vs_service *svc = NULL; @@ -1701,6 +1702,17 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } } + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, 
NULL, lfactor); + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + ret = -ENOMEM; + goto out_err; + } + } + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) @@ -1755,6 +1767,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, rcu_assign_pointer(ipvs->svc_table, t_new); t_new = NULL; } + if (tc_new) { + rcu_assign_pointer(ipvs->conn_tab, tc_new); + tc_new = NULL; + } /* Hash the service into the service table */ ip_vs_svc_hash(svc); @@ -1793,6 +1809,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (tc_new) + ip_vs_rht_free(tc_new); if (t_new) ip_vs_rht_free(t_new); if (ret_hooks >= 0) diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index e4ce1d9a63f9..fc419aa1dc3f 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -133,9 +133,9 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, } static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, - u32 initval, bool inverse) + struct ip_vs_rht *t, bool inverse) { - return jhash(p->pe_data, p->pe_data_len, initval); + return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]); } static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index be74c0906dda..6e0d2904e24a 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1756,6 +1756,28 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; + /* Backup server can be started without services just to sync conns, + * make sure conn_tab is created even if ipvs->enable is 0. 
+ */ + if (state == IP_VS_STATE_BACKUP) { + mutex_lock(&ipvs->service_mutex); + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, + lfactor); + struct ip_vs_rht *tc_new; + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + mutex_unlock(&ipvs->service_mutex); + result = -ENOMEM; + goto out_module; + } + rcu_assign_pointer(ipvs->conn_tab, tc_new); + } + mutex_unlock(&ipvs->service_mutex); + } + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1924,6 +1946,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); +out_module: /* decrease the module use count */ ip_vs_use_count_dec(); return result; From patchwork Tue Dec 12 16:24:40 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489587 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="rQ8Jq1JQ" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 782BEE9; Tue, 12 Dec 2023 08:30:16 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 141F01E08A; Tue, 12 Dec 2023 18:30:15 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id F01D31E089; Tue, 12 Dec 2023 18:30:14 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id CC5EF3C07C9; Tue, 12 Dec 2023 18:30:11 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398611; bh=7vYJIEBYbXwNTowtsTf1SyuA7gNREGa0CNB2W84U2gg=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=rQ8Jq1JQ+M4aAKHOVhfCkELkBqh2SA824k/YEStm2Yz3bMDsir6qixXqNYcDqTS3M Zn+D3EUYtlOHqUYguFQ+hOJJjJzSqny4n/3hed+3cG+g9awRdAURducrGzWrlWcgZ2 rYUV7DazGrQAtmy2h3bMQrgeAVHxb3qk3BzwElow= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQVAs094088; Tue, 12 Dec 2023 18:26:31 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQVEq094087; Tue, 12 Dec 2023 18:26:31 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 10/14] ipvs: show the current conn_tab size to users Date: Tue, 12 Dec 2023 18:24:40 +0200 Message-ID: <20231212162444.93801-11-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC As conn_tab is per-net, better to show the current hash table size to users instead of the ip_vs_conn_tab_size (max). 
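The current size is sampled under RCU, since the per-net table pointer can be replaced by the resizer or be NULL while no service (or backup sync daemon) is configured; the getters below all follow roughly this pattern:

	struct ip_vs_rht *t;
	int csize = 0;

	rcu_read_lock();
	t = rcu_dereference(ipvs->conn_tab);
	if (t)
		csize = t->size;
	rcu_read_unlock();
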
Signed-off-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 36 +++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 72f39e3b4a7c..2cc51e1c6328 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2741,10 +2741,16 @@ static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) static int ip_vs_info_seq_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + if (v == SEQ_START_TOKEN) { + struct ip_vs_rht *tc = rcu_dereference(ipvs->conn_tab); + int csize = tc ? tc->size : 0; + seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + NVERSION(IP_VS_VERSION_CODE), csize); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -3422,10 +3428,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IP_VS_SO_GET_VERSION: { + struct ip_vs_rht *t; + int csize = 0; char buf[64]; + rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (t) + csize = t->size; + rcu_read_unlock(); sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + NVERSION(IP_VS_VERSION_CODE), csize); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -3437,8 +3450,16 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; + struct ip_vs_rht *t; + int csize = 0; + + rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (t) + csize = t->size; + rcu_read_unlock(); info.version = IP_VS_VERSION_CODE; - info.size = ip_vs_conn_tab_size; + info.size = csize; info.num_services = atomic_read(&ipvs->num_services[IP_VS_AF_INET]); if (copy_to_user(user, &info, sizeof(info)) != 0) @@ -4380,6 +4401,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) int ret, cmd, reply_cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_rht *t; + int csize; cmd = info->genlhdr->cmd; @@ -4447,10 +4470,13 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) } case IPVS_CMD_GET_INFO: + csize = 0; + t = rcu_dereference(ipvs->conn_tab); + if (t) + csize = t->size; if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || - nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - ip_vs_conn_tab_size)) + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, csize)) goto nla_put_failure; break; } From patchwork Tue Dec 12 16:24:41 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489593 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="vgrkCQ/i" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 86010A8; Tue, 12 Dec 2023 08:30:40 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 1D0571DE73; Tue, 12 Dec 2023 18:30:39 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 044E21E138; Tue, 12 Dec 2023 18:30:39 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg 
(Postfix) with ESMTPSA id BDB323C07D5; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=4+9diIoqsLBAliroC+nh/t+Of6r9po5HgWTXGNBUv7w=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=vgrkCQ/ijujvdC3cvIp2a5gZe+KVwH240AYHG82JYICaC+1wULT6NzK6cs/xSZand PhIdK8qFW8rbeXRcEZggX3PTmr3uCsQ5rqJcC/G1MzfIYlG3WylxcEBCzyv0Z6eVDy CPtxH8Jr4B62ThyMlG1g37AMqhmpa97JRMKWDuX8= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQVfK094093; Tue, 12 Dec 2023 18:26:31 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQVcF094091; Tue, 12 Dec 2023 18:26:31 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 11/14] ipvs: no_cport and dropentry counters can be per-net Date: Tue, 12 Dec 2023 18:24:41 +0200 Message-ID: <20231212162444.93801-12-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC With using per-net conn_tab these counters do not need to be global anymore. Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 2 ++ net/netfilter/ipvs/ip_vs_conn.c | 62 ++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f5605c289bba..ed74e4e36f21 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1158,6 +1158,7 @@ struct netns_ipvs { #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ + atomic_t no_cport_conns[IP_VS_AF_MAX]; struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ @@ -1188,6 +1189,7 @@ struct netns_ipvs { int drop_counter; int old_secure_tcp; atomic_t dropentry; + s8 dropentry_counters[8]; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 4c03736a602a..a4c1f78687fb 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -53,9 +53,6 @@ int ip_vs_conn_tab_size __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - /* We need an addrstrlen that works with or without v6 */ #ifdef CONFIG_IP_VS_IPV6 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN @@ -318,10 +315,16 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) struct ip_vs_conn *cp; cp = __ip_vs_conn_in_get(p); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { - struct ip_vs_conn_param cport_zero_p = *p; - cport_zero_p.cport = 0; - cp = __ip_vs_conn_in_get(&cport_zero_p); + if (!cp) { + struct netns_ipvs *ipvs = p->ipvs; + int af_id = ip_vs_af_index(p->af); + + if (atomic_read(&ipvs->no_cport_conns[af_id])) { + struct ip_vs_conn_param cport_zero_p = *p; + + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } } IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", @@ -534,6 +537,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { 
struct hlist_bl_head *head, *head2, *head_new; struct netns_ipvs *ipvs = cp->ipvs; + int af_id = ip_vs_af_index(cp->af); u32 hash_r = 0, hash_key_r = 0; struct ip_vs_rht *t, *tp, *t2; u32 hash_key, hash_key_new; @@ -612,7 +616,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) hlist_bl_del_rcu(&cp->c_list); hlist_bl_add_head_rcu(&cp->c_list, head_new); } - atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ipvs->no_cport_conns[af_id]); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } @@ -1168,8 +1172,11 @@ static void ip_vs_conn_expire(struct timer_list *t) if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + int af_id = ip_vs_af_index(cp->af); + + atomic_dec(&ipvs->no_cport_conns[af_id]); + } if (cp->flags & IP_VS_CONN_F_ONE_PACKET) ip_vs_conn_rcu_free(&cp->rcu_head); else @@ -1276,8 +1283,11 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, cp->out_seq.delta = 0; atomic_inc(&ipvs->conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); + if (unlikely(flags & IP_VS_CONN_F_NO_CPORT)) { + int af_id = ip_vs_af_index(cp->af); + + atomic_inc(&ipvs->no_cport_conns[af_id]); + } /* Bind the connection with a destination server */ cp->dest = NULL; @@ -1553,6 +1563,7 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { }; #endif +#ifdef CONFIG_SYSCTL /* Randomly drop connection entries before running out of memory * Can be used for DATA and CTL conns. For TPL conns there are exceptions: @@ -1562,12 +1573,7 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { */ static inline int todrop_entry(struct ip_vs_conn *cp) { - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static signed char todrop_counter[9] = {0}; + struct netns_ipvs *ipvs = cp->ipvs; int i; /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 
@@ -1576,15 +1582,17 @@ static inline int todrop_entry(struct ip_vs_conn *cp) if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) return 0; - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ + /* Drop only conns with number of incoming packets in [1..8] range */ i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; + if (i > 8 || i < 1) + return 0; - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; + i--; + if (--ipvs->dropentry_counters[i] > 0) + return 0; - todrop_counter[i] = todrop_rate[i]; + /* Prefer to drop conns with less number of incoming packets */ + ipvs->dropentry_counters[i] = i + 1; return 1; } @@ -1678,7 +1686,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) out: rcu_read_unlock(); } - +#endif /* Flush all the connection entries in the conn_tab */ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) @@ -1803,7 +1811,11 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) */ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { + int idx; + atomic_set(&ipvs->conn_count, 0); + for (idx = 0; idx < IP_VS_AF_MAX; idx++) + atomic_set(&ipvs->no_cport_conns[idx], 0); INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler); RCU_INIT_POINTER(ipvs->conn_tab, NULL); atomic_set(&ipvs->conn_tab_changes, 0); From patchwork Tue Dec 12 16:24:42 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489598 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="WjrYObcb" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9F7E5CA; Tue, 12 Dec 2023 08:31:00 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 462AB1E21C; Tue, 12 Dec 2023 18:30:59 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 2C6681E177; Tue, 12 Dec 2023 18:30:59 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 8D5333C07DA; Tue, 12 Dec 2023 18:30:13 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398613; bh=vskMWWNXqFuWMM43rhop65ewGZh6MrUiEpS1Si22JnI=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=WjrYObcbeUk8lsUOrK3t76kR5ZBNS7Ic4iejRvkweysAB79ptuEVVufiXgZvfcQzW LlEyABeTk3gPR2TOwPObadKTWIu1TbnjmAg7chQLjrFYWaVFOcX8yEI4z9GMyDel6g TGMzHXzuj7vSNVw+54gdXM4C+PD1KlTiYVNEhOqg= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQVvR094099; Tue, 12 Dec 2023 18:26:31 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQVfZ094096; Tue, 12 Dec 2023 18:26:31 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 12/14] ipvs: use more keys for connection hashing Date: Tue, 12 Dec 2023 18:24:42 +0200 Message-ID: <20231212162444.93801-13-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: 
kuba@kernel.org X-Patchwork-State: RFC Simon Kirby reported a long time ago that IPVS connection hashing based only on the client address/port (caddr, cport) as hash keys is not suitable for setups that accept traffic on multiple virtual IPs and ports. This can happen with multiple VIP:VPORT services, with one or many fwmark services that match multiple virtual IPs and ports, or even with passive FTP with persistence in DR/TUN mode where we expect traffic on multiple ports for the virtual IP. Fix it by adding the virtual addresses and ports to the hash function. This causes the traffic from NAT real servers to clients to use a second hash for the in->out direction. As a result: - the IN direction from the client will use hash node hn0, where the source/dest addresses and ports used by the client are used as hash keys - the OUT direction from NAT real servers will use hash node hn1 for the traffic from real server to client - the persistence templates are hashed only with parameters based on the IN direction, so they will now also use the virtual address, port and fwmark from the service. OLD: - all methods: c_list node: proto, caddr:cport - persistence templates: c_list node: proto, caddr_net:0 - persistence engine templates: c_list node: per-PE, PE-SIP uses jhash NEW: - all methods: hn0 node (dir 0): proto, caddr:cport -> vaddr:vport - MASQ method: hn1 node (dir 1): proto, daddr:dport -> caddr:cport - persistence templates: hn0 node (dir 0): proto, caddr_net:0 -> vaddr:vport_or_0 proto, caddr_net:0 -> fwmark:0 - persistence engine templates: hn0 node (dir 0): as before Also reorder the ip_vs_conn fields so that the hash nodes are on the same read-mostly cache line while write-mostly fields are on a separate cache line. Reported-by: Simon Kirby Signed-off-by: Julian Anastasov --- include/net/ip_vs.h | 104 ++++++++---- net/netfilter/ipvs/ip_vs_conn.c | 279 ++++++++++++++++++++++++-------- 2 files changed, 279 insertions(+), 104 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ed74e4e36f21..5eb11ef9601a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -785,51 +785,48 @@ struct ip_vs_conn_param { __u8 pe_data_len; }; +/* Hash node in conn_tab */ +struct ip_vs_conn_hnode { + struct hlist_bl_node node; /* node in conn_tab */ + u32 hash_key; /* Key for the hash table */ + u8 dir; /* 0=out->in, 1=in->out */ +} __packed; + /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_bl_node c_list; /* node in conn_tab */ - __u32 hash_key; /* Key for the hash table */ - /* Protocol, addresses and port numbers */ + /* Cacheline for hash table nodes - rarely modified */ + + struct ip_vs_conn_hnode hn0; /* Original direction */ + u8 af; /* address family */ __be16 cport; + struct ip_vs_conn_hnode hn1; /* Reply direction */ + u8 daf; /* Address family of the dest */ __be16 dport; - __be16 vport; - u16 af; /* address family */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ + struct ip_vs_dest *dest; /* real server */ + atomic_t n_control; /* Number of controlled ones */ volatile __u32 flags; /* status flags */ - struct netns_ipvs *ipvs; - - /* counter and timer */ - refcount_t refcnt; /* reference count */ - struct timer_list timer; /* Expiration timer */ - volatile unsigned long timeout; /* timeout */ + /* 44/64 */ - /* Flags and state 
transition */ - spinlock_t lock; /* lock for state transition */ + struct ip_vs_conn *control; /* Master control connection */ + const struct ip_vs_pe *pe; + char *pe_data; + __u8 pe_data_len; volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ - __u32 fwmark; /* Fire wall mark from skb */ - unsigned long sync_endtime; /* jiffies + sent_retries */ + /* 2-byte hole */ + /* 64/96 */ - /* Control members */ - struct ip_vs_conn *control; /* Master control connection */ - atomic_t n_control; /* Number of controlled ones */ - struct ip_vs_dest *dest; /* real server */ - atomic_t in_pkts; /* incoming packet counter */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + /* 96/128 */ - /* Packet transmitter for different forwarding methods. If it - * mangles the packet, it must return NF_DROP or better NF_STOLEN, - * otherwise this must be changed to a sk_buff **. - * NF_ACCEPT can be returned when destination is local. - */ - int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); + union nf_inet_addr daddr; /* destination address */ + __u32 fwmark; /* Fire wall mark from skb */ + __be16 vport; + __u16 protocol; /* Which protocol (TCP/UDP) */ /* Note: we can group the following members into a structure, * in order to save more space, and the following members are @@ -837,14 +834,31 @@ struct ip_vs_conn { */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ + /* 128/168 */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ ); + /* 152/192 */ - const struct ip_vs_pe *pe; - char *pe_data; - __u8 pe_data_len; + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + spinlock_t lock; /* lock for state transition */ + refcount_t refcnt; /* reference count */ + atomic_t in_pkts; /* incoming packet counter */ + /* 64-bit: 4-byte gap */ + + /* 188/256 */ + unsigned long sync_endtime; /* jiffies + sent_retries */ + struct netns_ipvs *ipvs; + + /* Packet transmitter for different forwarding methods. If it + * mangles the packet, it must return NF_DROP or better NF_STOLEN, + * otherwise this must be changed to a sk_buff **. + * NF_ACCEPT can be returned when destination is local. + */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); struct rcu_head rcu_head; }; @@ -1614,6 +1628,19 @@ int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, int lfactor); +static inline struct ip_vs_conn * +ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn) +{ + return container_of(hn, struct ip_vs_conn, hn0); +} + +static inline struct ip_vs_conn * +ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn) +{ + return hn->dir ? 
container_of(hn, struct ip_vs_conn, hn1) : + container_of(hn, struct ip_vs_conn, hn0); +} + struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, @@ -1966,6 +1993,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) return fwd; } +/* Check if connection uses double hashing */ +static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp) +{ + return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ && + !(cp->flags & IP_VS_CONN_F_TEMPLATE); +} + void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a4c1f78687fb..e2fba5559df6 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -76,11 +76,19 @@ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; /* Lock conn_tab bucket for conn hash/unhash, not for rehash */ static __always_inline void conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, - bool new_hash, struct hlist_bl_head **head_ret) + u32 hash_key2, bool use2, bool new_hash, + struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret) { - struct hlist_bl_head *head; - u32 hash_key_new; + struct hlist_bl_head *head, *head2; + u32 hash_key_new, hash_key_new2; + struct ip_vs_rht *t2 = t; + u32 idx, idx2; + idx = hash_key & t->mask; + if (use2) + idx2 = hash_key2 & t->mask; + else + idx2 = idx; if (!new_hash) { /* We need to lock the bucket in the right table */ @@ -88,31 +96,64 @@ conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, if (!ip_vs_rht_same_table(t, hash_key)) { /* It is already moved to new table */ t = rcu_dereference(t->new_tbl); + /* Rehashing works in two steps and we may detect + * both nodes in different tables, use idx/idx2 + * for proper lock ordering for heads. + */ + idx = hash_key & t->mask; + idx |= IP_VS_RHT_TABLE_ID_MASK; + } + if (use2) { + if (!ip_vs_rht_same_table(t2, hash_key2)) { + /* It is already moved to new table */ + t2 = rcu_dereference(t2->new_tbl); + idx2 = hash_key2 & t2->mask; + idx2 |= IP_VS_RHT_TABLE_ID_MASK; + } + } else { + idx2 = idx; } } head = t->buckets + (hash_key & t->mask); + head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head; local_bh_disable(); /* Do not touch seqcount, this is a safe operation */ - hlist_bl_lock(head); + if (idx <= idx2) { + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + } else { + hlist_bl_lock(head2); + hlist_bl_lock(head); + } if (!new_hash) { /* Ensure hash_key is read under lock */ - hash_key_new = READ_ONCE(cp->hash_key); + hash_key_new = READ_ONCE(cp->hn0.hash_key); + hash_key_new2 = READ_ONCE(cp->hn1.hash_key); /* Hash changed ? 
*/ - if (hash_key != hash_key_new) { + if (hash_key != hash_key_new || + (hash_key2 != hash_key_new2 && use2)) { + if (head != head2) + hlist_bl_unlock(head2); hlist_bl_unlock(head); local_bh_enable(); hash_key = hash_key_new; + hash_key2 = hash_key_new2; goto retry; } } *head_ret = head; + *head2_ret = head2; } -static inline void conn_tab_unlock(struct hlist_bl_head *head) +static inline void conn_tab_unlock(struct hlist_bl_head *head, + struct hlist_bl_head *head2) { + if (head != head2) + hlist_bl_unlock(head2); hlist_bl_unlock(head); local_bh_enable(); } @@ -123,26 +164,34 @@ static void ip_vs_conn_expire(struct timer_list *t); * Returns hash value for IPVS connection entry */ static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, - const union nf_inet_addr *addr, __be16 port) + const union nf_inet_addr *addr, __be16 port, + const union nf_inet_addr *laddr, __be16 lport) { u64 a = (u32)proto << 16 | (__force u32)port; + u64 d; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { u64 b = (u64)addr->all[0] << 32 | addr->all[1]; u64 c = (u64)addr->all[2] << 32 | addr->all[3]; - return (u32)siphash_3u64(a, b, c, &t->hash_key); + a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport; + c ^= laddr->all[1]; + d = (u64)laddr->all[0] << 32 | laddr->all[3]; + return (u32)siphash_4u64(a, b, c, d, &t->hash_key); } #endif a |= (u64)addr->all[0] << 32; - return (u32)siphash_1u64(a, &t->hash_key); + d = (u64)laddr->all[0] << 32 | (__force u32)lport; + return (u32)siphash_2u64(a, d, &t->hash_key); } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, struct ip_vs_rht *t, bool inverse) { + const union nf_inet_addr *laddr; const union nf_inet_addr *addr; + __be16 lport; __be16 port; if (p->pe_data && p->pe->hashkey_raw) @@ -151,21 +200,33 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, if (likely(!inverse)) { addr = p->caddr; port = p->cport; + laddr = p->vaddr; + lport = p->vport; } else { addr = p->vaddr; port = p->vport; + laddr = p->caddr; + lport = p->cport; } - return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr, + lport); } static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, - const struct ip_vs_conn *cp) + const struct ip_vs_conn *cp, + bool out) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, - &cp->caddr, cp->cport, NULL, 0, &p); + if (!out) + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->caddr, cp->cport, &cp->vaddr, + cp->vport, &p); + else + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->daddr, cp->dport, &cp->caddr, + cp->cport, &p); if (cp->pe) { p.pe = cp->pe; @@ -173,7 +234,7 @@ static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, p.pe_data_len = cp->pe_data_len; } - return ip_vs_conn_hashkey_param(&p, t, false); + return ip_vs_conn_hashkey_param(&p, t, out); } /* Hashes ip_vs_conn in conn_tab @@ -182,9 +243,11 @@ static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; - struct hlist_bl_head *head; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; struct ip_vs_rht *t; - u32 hash_key; + u32 hash, hash2; + bool use2; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) @@ -194,15 +257,28 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) t = rcu_dereference(ipvs->conn_tab); t = rcu_dereference(t->new_tbl); - hash_key = 
ip_vs_rht_build_hash_key(t, ip_vs_conn_hashkey_conn(t, cp)); - conn_tab_lock(t, cp, hash_key, true /* new_hash */, &head); + hash = ip_vs_conn_hashkey_conn(t, cp, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + if (ip_vs_conn_use_hash2(cp)) { + hash2 = ip_vs_conn_hashkey_conn(t, cp, true); + hash_key2 = ip_vs_rht_build_hash_key(t, hash2); + use2 = true; + } else { + hash_key2 = hash_key; + use2 = false; + } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; - WRITE_ONCE(cp->hash_key, hash_key); + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); refcount_inc(&cp->refcnt); - hlist_bl_add_head_rcu(&cp->c_list, head); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pS\n", @@ -211,7 +287,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - conn_tab_unlock(head); + conn_tab_unlock(head, head2); /* Schedule resizing if load increases */ if (atomic_read(&ipvs->conn_count) > t->u_thresh && @@ -227,10 +303,11 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; - struct hlist_bl_head *head; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; struct ip_vs_rht *t; bool ret = false; - u32 hash_key; + bool use2; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return refcount_dec_if_one(&cp->refcnt); @@ -238,22 +315,27 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) rcu_read_lock(); t = rcu_dereference(ipvs->conn_tab); - hash_key = READ_ONCE(cp->hash_key); + hash_key = READ_ONCE(cp->hn0.hash_key); + hash_key2 = READ_ONCE(cp->hn1.hash_key); + use2 = ip_vs_conn_use_hash2(cp); - conn_tab_lock(t, cp, hash_key, false /* new_hash */, &head); + conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { - hlist_bl_del_rcu(&cp->c_list); + hlist_bl_del_rcu(&cp->hn0.node); + if (use2) + hlist_bl_del_rcu(&cp->hn1.node); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } spin_unlock(&cp->lock); - conn_tab_unlock(head); + conn_tab_unlock(head, head2); rcu_read_unlock(); @@ -272,6 +354,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *pt; struct hlist_bl_node *e; @@ -284,9 +367,12 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, t, false); hash_key = ip_vs_rht_build_hash_key(t, hash); ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { - if (READ_ONCE(cp->hash_key) == hash_key && - p->cport == cp->cport && + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + if (p->cport == cp->cport && p->vport == cp->vport && cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && @@ -376,6 +462,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); struct 
netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *pt; struct hlist_bl_node *e; @@ -388,9 +475,11 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, t, false); hash_key = ip_vs_rht_build_hash_key(t, hash); ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { - if (READ_ONCE(cp->hash_key) != hash_key) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) continue; + cp = ip_vs_hn0_to_conn(hn); if (unlikely(p->pe_data && p->pe->ct_match)) { if (p->pe == cp->pe && p->pe->ct_match(p, cp) && @@ -442,6 +531,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); struct netns_ipvs *ipvs = p->ipvs; const union nf_inet_addr *saddr; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *pt; struct hlist_bl_node *e; @@ -455,9 +545,12 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, t, true); hash_key = ip_vs_rht_build_hash_key(t, hash); ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { - if (READ_ONCE(cp->hash_key) != hash_key || - p->vport != cp->cport) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + /* dir can be 0 for DR/TUN */ + if (READ_ONCE(hn->hash_key) != hash_key) + continue; + cp = ip_vs_hn_to_conn(hn); + if (p->vport != cp->cport) continue; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { @@ -536,21 +629,33 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { struct hlist_bl_head *head, *head2, *head_new; + bool use2 = ip_vs_conn_use_hash2(cp); struct netns_ipvs *ipvs = cp->ipvs; int af_id = ip_vs_af_index(cp->af); u32 hash_r = 0, hash_key_r = 0; struct ip_vs_rht *t, *tp, *t2; + struct ip_vs_conn_hnode *hn; u32 hash_key, hash_key_new; struct ip_vs_conn_param p; int ntbl; + int dir; - ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, - cport, &cp->vaddr, cp->vport, &p); + /* No packets from inside, so we can do it in 2 steps. */ + dir = use2 ? 1 : 0; + +next_dir: + if (dir) + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr, + cp->dport, &cp->caddr, cport, &p); + else + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, + cport, &cp->vaddr, cp->vport, &p); + hn = dir ? &cp->hn1 : &cp->hn0; ntbl = 0; /* Attempt to rehash cp safely, by informing seqcount readers */ t = rcu_dereference(ipvs->conn_tab); - hash_key = READ_ONCE(cp->hash_key); + hash_key = READ_ONCE(hn->hash_key); tp = NULL; retry: @@ -567,7 +672,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) t2 = rcu_dereference(t->new_tbl); /* Calc new hash once per table */ if (tp != t2) { - hash_r = ip_vs_conn_hashkey_param(&p, t2, false); + hash_r = ip_vs_conn_hashkey_param(&p, t2, dir); hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); tp = t2; } @@ -591,7 +696,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) hlist_bl_lock(head2); /* Ensure hash_key is read under lock */ - hash_key_new = READ_ONCE(cp->hash_key); + hash_key_new = READ_ONCE(hn->hash_key); /* Racing with another rehashing ? */ if (unlikely(hash_key != hash_key_new)) { if (head != head2) @@ -611,14 +716,21 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) * parameters in cp do not change, i.e. 
cport is * the only possible change. */ - WRITE_ONCE(cp->hash_key, hash_key_r); + WRITE_ONCE(hn->hash_key, hash_key_r); + if (!use2) + WRITE_ONCE(cp->hn1.hash_key, hash_key_r); + /* For dir=1 we do not check in flags if hn is already + * rehashed but this check will do it. + */ if (head != head2) { - hlist_bl_del_rcu(&cp->c_list); - hlist_bl_add_head_rcu(&cp->c_list, head_new); + hlist_bl_del_rcu(&hn->node); + hlist_bl_add_head_rcu(&hn->node, head_new); + } + if (!dir) { + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; } - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; } spin_unlock(&cp->lock); @@ -628,6 +740,8 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + if (dir--) + goto next_dir; } /* Get default load factor to map conn_count/u_thresh to t->size */ @@ -639,6 +753,8 @@ static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) factor = 3; else factor = 1; + /* Double hashing adds twice more nodes for NAT */ + factor++; return factor; } @@ -679,6 +795,7 @@ static void conn_resize_work_handler(struct work_struct *work) unsigned int resched_score = 0; struct hlist_bl_node *cn, *nn; struct ip_vs_rht *t, *t_new; + struct ip_vs_conn_hnode *hn; struct netns_ipvs *ipvs; struct ip_vs_conn *cp; bool more_work = false; @@ -747,8 +864,9 @@ static void conn_resize_work_handler(struct work_struct *work) write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); hlist_bl_lock(head); - hlist_bl_for_each_entry_safe(cp, cn, nn, head, c_list) { - hash = ip_vs_conn_hashkey_conn(t_new, cp); + hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) { + cp = ip_vs_hn_to_conn(hn); + hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir); hash_key = ip_vs_rht_build_hash_key(t_new, hash); head2 = t_new->buckets + (hash & t_new->mask); @@ -756,9 +874,12 @@ static void conn_resize_work_handler(struct work_struct *work) /* t_new->seqc are not used at this stage, we race * only with add/del, so only lock the bucket. */ - hlist_bl_del_rcu(&cp->c_list); - WRITE_ONCE(cp->hash_key, hash_key); - hlist_bl_add_head_rcu(&cp->c_list, head2); + hlist_bl_del_rcu(&hn->node); + WRITE_ONCE(hn->hash_key, hash_key); + /* Keep both hash keys in sync if no double hashing */ + if (!ip_vs_conn_use_hash2(cp)) + WRITE_ONCE(cp->hn1.hash_key, hash_key); + hlist_bl_add_head_rcu(&hn->node, head2); hlist_bl_unlock(head2); /* Too long chain? 
Do it in steps */ if (++limit >= 64) @@ -1236,10 +1357,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, return NULL; } - INIT_HLIST_BL_NODE(&cp->c_list); + INIT_HLIST_BL_NODE(&cp->hn0.node); + INIT_HLIST_BL_NODE(&cp->hn1.node); timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; + cp->hn0.dir = 0; cp->af = p->af; + cp->hn1.dir = 1; cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); @@ -1343,22 +1467,24 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_rht *t = iter->t; + struct ip_vs_conn_hnode *hn; struct hlist_bl_node *e; - struct ip_vs_conn *cp; int idx; if (!t) return NULL; for (idx = 0; idx < t->size; idx++) { - hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[idx], c_list) { + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ - if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) break; + if (hn->dir != 0) + continue; if (pos-- == 0) { iter->bucket = idx; - return cp; + return hn; } } if (!(idx & 31)) { @@ -1391,7 +1517,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct ip_vs_iter_state *iter = seq->private; struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_conn *cp = v; + struct ip_vs_conn_hnode *hn = v; struct hlist_bl_node *e; struct ip_vs_rht *t; @@ -1404,19 +1530,21 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return NULL; /* more on same hash chain? */ - hlist_bl_for_each_entry_continue_rcu(cp, e, c_list) { + hlist_bl_for_each_entry_continue_rcu(hn, e, node) { /* Our cursor was moved to new table ? 
*/ - if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) break; - return cp; + if (!hn->dir) + return hn; } while (++iter->bucket < t->size) { - hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[iter->bucket], - c_list) { - if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[iter->bucket], + node) { + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) break; - return cp; + if (!hn->dir) + return hn; } if (!(iter->bucket & 31)) { cond_resched_rcu(); @@ -1441,7 +1569,8 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { - const struct ip_vs_conn *cp = v; + struct ip_vs_conn_hnode *hn = v; + const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; @@ -1608,6 +1737,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { + struct ip_vs_conn_hnode *hn; struct hlist_bl_node *e; struct ip_vs_conn *cp; struct ip_vs_rht *t; @@ -1628,7 +1758,10 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) /* Don't care if due to moved entry we jump to another bucket * and even to new table */ - hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[hash], c_list) { + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { @@ -1693,6 +1826,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *p; struct hlist_bl_node *e; @@ -1707,7 +1841,10 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) /* Rely on RCU grace period while accessing cp after ip_vs_conn_del */ rcu_read_lock(); ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; cp_c = cp->control; @@ -1754,6 +1891,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); unsigned int resched_score = 0; struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_dest *dest; struct hlist_bl_node *e; @@ -1767,7 +1905,10 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) repeat: smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); resched_score++; dest = cp->dest; if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) From patchwork Tue Dec 12 16:24:43 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489591 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="GYIf+WbN" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net 
(Postfix) with ESMTPS id 37B1311A; Tue, 12 Dec 2023 08:30:33 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id C627D1E1EA; Tue, 12 Dec 2023 18:30:31 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id AD3221E0A0; Tue, 12 Dec 2023 18:30:31 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 82DE83C07D3; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=dgzP0Aka2MFEYSY/vQr/tclATQ85/eEXckkKio8S4kU=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=GYIf+WbNseULfrViQxXCP59wdqp/2rAYXKfDX7zCRNsqOyEzfyTvEGcwI6rlvoIto pnLCyzGPZ1JewQ+wt1pjrS5WXHdO+IHBDDQXsGUG5SWLMxPEuBGh25kSXXJ/Wz0txQ e9j6m1TGQa+73jNHHs3PiFTWwHz06KTbrezQCfGg= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQWJI094103; Tue, 12 Dec 2023 18:26:32 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQWfF094102; Tue, 12 Dec 2023 18:26:32 +0200 From: Julian Anastasov To: Simon Horman Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 13/14] ipvs: add ip_vs_status info Date: Tue, 12 Dec 2023 18:24:43 +0200 Message-ID: <20231212162444.93801-14-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Add /proc/net/ip_vs_status to show current state of IPVS. Signed-off-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 145 +++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 2cc51e1c6328..802447106959 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2913,6 +2913,144 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) return 0; } + +static int ip_vs_status_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned int resched_score = 0; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + int old_gen, new_gen; + u32 counts[8]; + u32 bucket; + int count; + u32 sum1; + u32 sum; + int i; + + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + + seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); + seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", + t ? t->size : 0, t ? t->bits : 0, t ? 
t->lfactor : 0); + + if (!atomic_read(&ipvs->conn_count)) + goto after_conns; + old_gen = atomic_read(&ipvs->conn_tab_changes); + +repeat_conn: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + memset(counts, 0, sizeof(counts)); + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + for (bucket = 0; bucket < t->size; bucket++) { + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + + count = 0; + resched_score++; + ip_vs_rht_walk_bucket_rcu(t, bucket, head) { + count = 0; + hlist_bl_for_each_entry_rcu(hn, e, head, node) + count++; + } + resched_score += count; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? */ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat_conn; + } + } + counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + } + } + for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) + sum += counts[i]; + sum1 = sum - counts[0]; + seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", + counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + for (i = 1; i < ARRAY_SIZE(counts); i++) { + if (!counts[i]) + continue; + seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + i, counts[i], + (unsigned long)counts[i] * 100 / max(sum1, 1U)); + } + +after_conns: + t = rcu_dereference(ipvs->svc_table); + + count = ip_vs_get_num_services(ipvs); + seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", + t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); + + if (!count) + goto after_svc; + old_gen = atomic_read(&ipvs->svc_table_changes); + +repeat_svc: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + memset(counts, 0, sizeof(counts)); + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { + for (bucket = 0; bucket < t->size; bucket++) { + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + + count = 0; + resched_score++; + ip_vs_rht_walk_bucket_rcu(t, bucket, head) { + count = 0; + hlist_bl_for_each_entry_rcu(svc, e, head, + s_list) + count++; + } + resched_score += count; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat_svc; + } + } + counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + } + } + for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) + sum += counts[i]; + sum1 = sum - counts[0]; + seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", + counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + for (i = 1; i < ARRAY_SIZE(counts); i++) { + if (!counts[i]) + continue; + seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + i, counts[i], + (unsigned long)counts[i] * 100 / max(sum1, 1U)); + } + +after_svc: + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", + ipvs->est_kt_count, ipvs->est_max_threads); + seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); + seq_printf(seq, "Stats thread ests:\t%d\n", + ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * + IPVS_EST_NTICKS); + + rcu_read_unlock(); + return 0; +} + #endif /* @@ -4827,6 +4965,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; + if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + ip_vs_status_show, NULL)) + goto err_status; #endif ret = ip_vs_control_net_init_sysctl(ipvs); @@ -4837,6 +4978,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) err: #ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_status", ipvs->net->proc_net); + +err_status: remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: @@ -4862,6 +5006,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_status", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); From patchwork Tue Dec 12 16:24:44 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Julian Anastasov X-Patchwork-Id: 13489589 X-Patchwork-Delegate: kuba@kernel.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=ssi.bg header.i=@ssi.bg header.b="Gev/0u/s" Received: from mg.ssi.bg (mg.ssi.bg [193.238.174.37]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B7149E8; Tue, 12 Dec 2023 08:30:24 -0800 (PST) Received: from mg.bb.i.ssi.bg (localhost [127.0.0.1]) by mg.bb.i.ssi.bg (Proxmox) with ESMTP id 25FFB1DE50; Tue, 12 Dec 2023 18:30:23 +0200 (EET) Received: from ink.ssi.bg (ink.ssi.bg [193.238.174.40]) by mg.bb.i.ssi.bg (Proxmox) with ESMTPS id 04A611DE4F; Tue, 12 Dec 2023 18:30:23 +0200 (EET) Received: from ja.ssi.bg (unknown [213.16.62.126]) by ink.ssi.bg (Postfix) with ESMTPSA id 34B173C07D1; Tue, 12 Dec 2023 18:30:12 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=ssi.bg; s=ink; t=1702398612; bh=mUCkdfyyh2jiVCW0T3nBduEaNuO4SA1RzVDx0bZ7GEs=; h=From:To:Cc:Subject:Date:In-Reply-To:References; b=Gev/0u/sjiVVASoIADjHiSx1L3LnO8CFBYfqnRSN7wxvqtxJfEbneKp2O9RRMpHRd MXKVbtTciqOMZzJ0rl9D7NgkMiw5VsxB4uMrUC1QxWgGMyPZJpCUiQMbzgl2EZs4W3 11XLUGj5ISVFZ5ZohwkosZGO2tW37SzkXEcsy+yk= Received: from ja.home.ssi.bg (localhost.localdomain [127.0.0.1]) by ja.ssi.bg (8.17.1/8.17.1) with ESMTP id 3BCGQWrL094107; Tue, 12 Dec 2023 18:26:32 +0200 Received: (from root@localhost) by ja.home.ssi.bg (8.17.1/8.17.1/Submit) id 3BCGQW0o094106; Tue, 12 Dec 2023 18:26:32 +0200 From: Julian Anastasov To: Simon Horman 
Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, Dust Li , Jiejian Wu , Jiri Wiesner Subject: [PATCHv2 RFC net-next 14/14] ipvs: add conn_lfactor and svc_lfactor sysctl vars Date: Tue, 12 Dec 2023 18:24:44 +0200 Message-ID: <20231212162444.93801-15-ja@ssi.bg> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231212162444.93801-1-ja@ssi.bg> References: <20231212162444.93801-1-ja@ssi.bg> Precedence: bulk X-Mailing-List: netdev@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-Patchwork-Delegate: kuba@kernel.org X-Patchwork-State: RFC Allow the default load factor for the connection and service tables to be configured. Signed-off-by: Julian Anastasov --- Documentation/networking/ipvs-sysctl.rst | 31 ++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 72 ++++++++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 3fb5fa142eef..61fdc0ec4c39 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -29,6 +29,28 @@ backup_only - BOOLEAN If set, disable the director function while the server is in backup mode to avoid packet loops for DR/TUN methods. +conn_lfactor - INTEGER + 4 - default + Valid range: -8 (smaller table) .. 8 (larger table) + + Controls the sizing of the connection hash table based on the + load factor (number of connections per table bucket). + As a result, the table grows when load increases and shrinks + when load decreases, within the range of 2^8 - 2^conn_tab_bits + (module parameter). + The value is a shift count where positive values select + buckets = (connection hash nodes << value) while negative + values select buckets = (connection hash nodes >> value). + Positive values reduce collisions and lookup time but increase + the table size. Negative values tolerate load above 100% when + a smaller table is preferred. If using NAT connections, consider + increasing the value by one because they add two nodes to the + hash table. + + Example: + 4: grow if load goes above 6% (buckets = nodes * 16) + -2: grow if load goes above 400% (buckets = nodes / 4) + conn_reuse_mode - INTEGER 1 - default @@ -219,6 +241,15 @@ secure_tcp - INTEGER The value definition is the same as that of drop_entry and drop_packet. +svc_lfactor - INTEGER + 3 - default + Valid range: -8 (smaller table) .. 8 (larger table) + + Controls the sizing of the service hash table based on the + load factor (number of services per table bucket). The table + will grow and shrink in the range of 2^4 - 2^20. + See conn_lfactor for an explanation. 
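To make the shift-count arithmetic above concrete, here is a small sketch (an editor's illustration with an assumed helper name and clamping bounds, not code from this patch; for the authoritative sizing behaviour see ip_vs_conn_desired_size() and conn_resize_work_handler() earlier in the series):

	/* Illustration only: map the number of hash nodes and an lfactor
	 * (shift count) to a power-of-two bucket count, clamped to the
	 * range [2^min_bits, 2^max_bits]. Assumed helper, not from the patch.
	 */
	static unsigned int lfactor_to_buckets(unsigned int nodes, int lfactor,
					       unsigned int min_bits,
					       unsigned int max_bits)
	{
		unsigned long long want;
		unsigned int bits;

		if (lfactor >= 0)
			want = (unsigned long long)nodes << lfactor; /* more buckets than nodes */
		else
			want = nodes >> -lfactor;                    /* fewer buckets than nodes */

		/* round up to the next power of two within the allowed range */
		for (bits = min_bits; bits < max_bits; bits++)
			if ((1ULL << bits) >= want)
				break;
		return 1U << bits;
	}

For example, 200000 hash nodes (100000 NAT connections contribute two nodes each) with conn_lfactor=4 request about 3.2 million buckets before rounding up to a power of two, matching the "buckets = nodes * 16" case above; with conn_lfactor=-2 the same nodes request only 50000 buckets, tolerating a load of up to 400%.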
+ sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period default 3 50 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 802447106959..e717c1cdf59c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2430,6 +2430,60 @@ static int ipvs_proc_run_estimation(struct ctl_table *table, int write, return ret; } +static int ipvs_proc_conn_lfactor(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < -8 || val > 8) { + ret = -EINVAL; + } else { + *valp = val; + if (rcu_dereference_protected(ipvs->conn_tab, 1)) + mod_delayed_work(system_unbound_wq, + &ipvs->conn_resize_work, 0); + } + } + return ret; +} + +static int ipvs_proc_svc_lfactor(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < -8 || val > 8) { + ret = -EINVAL; + } else { + *valp = val; + if (rcu_dereference_protected(ipvs->svc_table, 1)) + mod_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 0); + } + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2618,6 +2672,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = ipvs_proc_est_nice, }, + { + .procname = "conn_lfactor", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_conn_lfactor, + }, + { + .procname = "svc_lfactor", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_svc_lfactor, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4855,6 +4921,12 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_conn_lfactor; + + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_svc_lfactor; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net))