[PATCHv2,RFC,net-next,12/14] ipvs: use more keys for connection hashing

Message ID 20231212162444.93801-13-ja@ssi.bg (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series ipvs: per-net tables and optimizations

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1117 this patch: 1119
netdev/cc_maintainers warning 8 maintainers not CCed: pablo@netfilter.org kadlec@netfilter.org coreteam@netfilter.org kuba@kernel.org dsahern@kernel.org fw@strlen.de pabeni@redhat.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 1142 this patch: 1142
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1144 this patch: 1146
netdev/checkpatch warning WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline warning Was 1 now: 1

Commit Message

Julian Anastasov Dec. 12, 2023, 4:24 p.m. UTC
Simon Kirby reported a long time ago that IPVS connection hashing
based only on the client address/port (caddr, cport) as hash keys
is not suitable for setups that accept traffic on multiple virtual
IPs and ports. This can happen with multiple VIP:VPORT services,
with one or more fwmark services that match multiple virtual IPs
and ports, or even with passive FTP with persistence in DR/TUN mode,
where we expect traffic on multiple ports for the virtual IP.

Fix it by adding the virtual addresses and ports to the hash
function. As a consequence, the traffic from NAT real servers to
clients uses a second hash for the in->out direction (see the
sketch after the key layout below).

As a result:

- the IN direction from the client will use hash node hn0, where
the source/destination addresses and ports used by the client are
used as hash keys

- the OUT direction will use hash node hn1 for the traffic from
the NAT real server back to the client

- the persistence templates are hashed only with parameters based on
the IN direction, so they will now also use the virtual address,
port and fwmark from the service.

OLD:
- all methods: c_list node: proto, caddr:cport
- persistence templates: c_list node: proto, caddr_net:0
- persistence engine templates: c_list node: per-PE, PE-SIP uses jhash

NEW:
- all methods: hn0 node (dir 0): proto, caddr:cport -> vaddr:vport
- MASQ method: hn1 node (dir 1): proto, daddr:dport -> caddr:cport
- persistence templates: hn0 node (dir 0):
  proto, caddr_net:0 -> vaddr:vport_or_0
  proto, caddr_net:0 -> fwmark:0
- persistence engine templates: hn0 node (dir 0): as before
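
For illustration, here is a minimal userspace sketch of how one key
per direction could be derived under the NEW scheme. It mirrors the
shape of ip_vs_conn_hashkey() from the patch below (IPv4 case only)
but substitutes a toy 64-bit mixer for the kernel's keyed siphash,
so all names and constants here are hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for siphash with a per-table secret key */
static uint32_t toy_mix(uint64_t a, uint64_t d)
{
	uint64_t h = (a ^ d) * 0x9e3779b97f4a7c15ULL;

	h ^= h >> 32;
	return (uint32_t)h;
}

/* (proto, addr:port -> laddr:lport), like ip_vs_conn_hashkey() */
static uint32_t conn_key(uint8_t proto, uint32_t addr, uint16_t port,
			 uint32_t laddr, uint16_t lport)
{
	uint64_t a = ((uint64_t)addr << 32) | ((uint32_t)proto << 16) | port;
	uint64_t d = ((uint64_t)laddr << 32) | lport;

	return toy_mix(a, d);
}

int main(void)
{
	/* hn0 (dir 0): client tuple -> virtual tuple */
	uint32_t k0 = conn_key(6, 0xc0a80001, 40000, 0x0a000001, 80);
	/* hn1 (dir 1, MASQ only): real server tuple -> client tuple */
	uint32_t k1 = conn_key(6, 0x0a000102, 8080, 0xc0a80001, 40000);

	printf("hn0 key=0x%08x hn1 key=0x%08x\n", k0, k1);
	return 0;
}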

Also reorder the ip_vs_conn fields so that the hash nodes sit on the
same read-mostly cache line, while write-mostly fields are kept on a
separate cache line.
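
The /* 44/64 */ style annotations in the struct below are the
expected byte offsets for 32-bit/64-bit builds. If you want to
verify the resulting layout, pahole can print it from a built
object, e.g. (the object path is the usual build-tree location,
adjust as needed):

  $ pahole -C ip_vs_conn net/netfilter/ipvs/ip_vs_conn.o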

Reported-by: Simon Kirby <sim@hostway.ca>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/ip_vs.h             | 104 ++++++++----
 net/netfilter/ipvs/ip_vs_conn.c | 279 ++++++++++++++++++++++++--------
 2 files changed, 279 insertions(+), 104 deletions(-)
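
One detail worth noting before the patch: conn_tab_lock() may now
have to hold two bucket locks at once, because hn0 and hn1 can hash
to different buckets (and, during a rehash, to buckets in different
tables). Deadlock is avoided the usual way, by taking the two locks
in a fixed index order; the patch does this with hlist_bl_lock() on
the bucket heads. A generic kernel-context sketch of the idiom, with
hypothetical names:

#include <linux/spinlock.h>

/* Take two bucket locks in index order so two CPUs locking the
 * same pair from opposite directions cannot deadlock. Hypothetical
 * helper, shown only to illustrate the ordering rule.
 */
static void lock_two_buckets(spinlock_t *locks, u32 idx, u32 idx2)
{
	if (idx == idx2) {
		spin_lock(&locks[idx]);
	} else if (idx < idx2) {
		spin_lock(&locks[idx]);
		spin_lock(&locks[idx2]);
	} else {
		spin_lock(&locks[idx2]);
		spin_lock(&locks[idx]);
	}
}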

Patch

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ed74e4e36f21..5eb11ef9601a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -785,51 +785,48 @@  struct ip_vs_conn_param {
 	__u8				pe_data_len;
 };
 
+/* Hash node in conn_tab */
+struct ip_vs_conn_hnode {
+	struct hlist_bl_node	node;		/* node in conn_tab */
+	u32			hash_key;	/* Key for the hash table */
+	u8			dir;		/* 0=out->in, 1=in->out */
+} __packed;
+
 /* IP_VS structure allocated for each dynamically scheduled connection */
 struct ip_vs_conn {
-	struct hlist_bl_node	c_list;         /* node in conn_tab */
-	__u32			hash_key;	/* Key for the hash table */
-	/* Protocol, addresses and port numbers */
+	/* Cacheline for hash table nodes - rarely modified */
+
+	struct ip_vs_conn_hnode	hn0;		/* Original direction */
+	u8			af;		/* address family */
 	__be16                  cport;
+	struct ip_vs_conn_hnode	hn1;		/* Reply direction */
+	u8			daf;		/* Address family of the dest */
 	__be16                  dport;
-	__be16                  vport;
-	u16			af;		/* address family */
-	__u16                   protocol;       /* Which protocol (TCP/UDP) */
-	__u16			daf;		/* Address family of the dest */
-	union nf_inet_addr      caddr;          /* client address */
-	union nf_inet_addr      vaddr;          /* virtual address */
-	union nf_inet_addr      daddr;          /* destination address */
+	struct ip_vs_dest       *dest;          /* real server */
+	atomic_t                n_control;      /* Number of controlled ones */
 	volatile __u32          flags;          /* status flags */
-	struct netns_ipvs	*ipvs;
-
-	/* counter and timer */
-	refcount_t		refcnt;		/* reference count */
-	struct timer_list	timer;		/* Expiration timer */
-	volatile unsigned long	timeout;	/* timeout */
+	/* 44/64 */
 
-	/* Flags and state transition */
-	spinlock_t              lock;           /* lock for state transition */
+	struct ip_vs_conn       *control;       /* Master control connection */
+	const struct ip_vs_pe	*pe;
+	char			*pe_data;
+	__u8			pe_data_len;
 	volatile __u16          state;          /* state info */
 	volatile __u16          old_state;      /* old state, to be used for
 						 * state transition triggered
 						 * synchronization
 						 */
-	__u32			fwmark;		/* Fire wall mark from skb */
-	unsigned long		sync_endtime;	/* jiffies + sent_retries */
+	/* 2-byte hole */
+	/* 64/96 */
 
-	/* Control members */
-	struct ip_vs_conn       *control;       /* Master control connection */
-	atomic_t                n_control;      /* Number of controlled ones */
-	struct ip_vs_dest       *dest;          /* real server */
-	atomic_t                in_pkts;        /* incoming packet counter */
+	union nf_inet_addr      caddr;          /* client address */
+	union nf_inet_addr      vaddr;          /* virtual address */
+	/* 96/128 */
 
-	/* Packet transmitter for different forwarding methods.  If it
-	 * mangles the packet, it must return NF_DROP or better NF_STOLEN,
-	 * otherwise this must be changed to a sk_buff **.
-	 * NF_ACCEPT can be returned when destination is local.
-	 */
-	int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
-			   struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+	union nf_inet_addr      daddr;          /* destination address */
+	__u32			fwmark;		/* Fire wall mark from skb */
+	__be16                  vport;
+	__u16                   protocol;       /* Which protocol (TCP/UDP) */
 
 	/* Note: we can group the following members into a structure,
 	 * in order to save more space, and the following members are
@@ -837,14 +834,31 @@  struct ip_vs_conn {
 	 */
 	struct ip_vs_app        *app;           /* bound ip_vs_app object */
 	void                    *app_data;      /* Application private data */
+	/* 128/168 */
 	struct_group(sync_conn_opt,
 		struct ip_vs_seq  in_seq;       /* incoming seq. struct */
 		struct ip_vs_seq  out_seq;      /* outgoing seq. struct */
 	);
+	/* 152/192 */
 
-	const struct ip_vs_pe	*pe;
-	char			*pe_data;
-	__u8			pe_data_len;
+	struct timer_list	timer;		/* Expiration timer */
+	volatile unsigned long	timeout;	/* timeout */
+	spinlock_t              lock;           /* lock for state transition */
+	refcount_t		refcnt;		/* reference count */
+	atomic_t                in_pkts;        /* incoming packet counter */
+	/* 64-bit: 4-byte gap */
+
+	/* 188/256 */
+	unsigned long		sync_endtime;	/* jiffies + sent_retries */
+	struct netns_ipvs	*ipvs;
+
+	/* Packet transmitter for different forwarding methods.  If it
+	 * mangles the packet, it must return NF_DROP or better NF_STOLEN,
+	 * otherwise this must be changed to a sk_buff **.
+	 * NF_ACCEPT can be returned when destination is local.
+	 */
+	int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
+			   struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
 
 	struct rcu_head		rcu_head;
 };
@@ -1614,6 +1628,19 @@  int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
 struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets,
 				       int lfactor);
 
+static inline struct ip_vs_conn *
+ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn)
+{
+	return container_of(hn, struct ip_vs_conn, hn0);
+}
+
+static inline struct ip_vs_conn *
+ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn)
+{
+	return hn->dir ? container_of(hn, struct ip_vs_conn, hn1) :
+			 container_of(hn, struct ip_vs_conn, hn0);
+}
+
 struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
 				  const union nf_inet_addr *daddr,
 				  __be16 dport, unsigned int flags,
@@ -1966,6 +1993,13 @@  static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp)
 	return fwd;
 }
 
+/* Check if connection uses double hashing */
+static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp)
+{
+	return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ &&
+	       !(cp->flags & IP_VS_CONN_F_TEMPLATE);
+}
+
 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		    struct ip_vs_conn *cp, int dir);
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index a4c1f78687fb..e2fba5559df6 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -76,11 +76,19 @@  static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
 /* Lock conn_tab bucket for conn hash/unhash, not for rehash */
 static __always_inline void
 conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key,
-	      bool new_hash, struct hlist_bl_head **head_ret)
+	      u32 hash_key2, bool use2, bool new_hash,
+	      struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret)
 {
-	struct hlist_bl_head *head;
-	u32 hash_key_new;
+	struct hlist_bl_head *head, *head2;
+	u32 hash_key_new, hash_key_new2;
+	struct ip_vs_rht *t2 = t;
+	u32 idx, idx2;
 
+	idx = hash_key & t->mask;
+	if (use2)
+		idx2 = hash_key2 & t->mask;
+	else
+		idx2 = idx;
 	if (!new_hash) {
 		/* We need to lock the bucket in the right table */
 
@@ -88,31 +96,64 @@  conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key,
 		if (!ip_vs_rht_same_table(t, hash_key)) {
 			/* It is already moved to new table */
 			t = rcu_dereference(t->new_tbl);
+			/* Rehashing works in two steps and we may detect
+			 * both nodes in different tables, use idx/idx2
+			 * for proper lock ordering for heads.
+			 */
+			idx = hash_key & t->mask;
+			idx |= IP_VS_RHT_TABLE_ID_MASK;
+		}
+		if (use2) {
+			if (!ip_vs_rht_same_table(t2, hash_key2)) {
+				/* It is already moved to new table */
+				t2 = rcu_dereference(t2->new_tbl);
+				idx2 = hash_key2 & t2->mask;
+				idx2 |= IP_VS_RHT_TABLE_ID_MASK;
+			}
+		} else {
+			idx2 = idx;
 		}
 	}
 
 	head = t->buckets + (hash_key & t->mask);
+	head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head;
 
 	local_bh_disable();
 	/* Do not touch seqcount, this is a safe operation */
 
-	hlist_bl_lock(head);
+	if (idx <= idx2) {
+		hlist_bl_lock(head);
+		if (head != head2)
+			hlist_bl_lock(head2);
+	} else {
+		hlist_bl_lock(head2);
+		hlist_bl_lock(head);
+	}
 	if (!new_hash) {
 		/* Ensure hash_key is read under lock */
-		hash_key_new = READ_ONCE(cp->hash_key);
+		hash_key_new = READ_ONCE(cp->hn0.hash_key);
+		hash_key_new2 = READ_ONCE(cp->hn1.hash_key);
 		/* Hash changed ? */
-		if (hash_key != hash_key_new) {
+		if (hash_key != hash_key_new ||
+		    (hash_key2 != hash_key_new2 && use2)) {
+			if (head != head2)
+				hlist_bl_unlock(head2);
 			hlist_bl_unlock(head);
 			local_bh_enable();
 			hash_key = hash_key_new;
+			hash_key2 = hash_key_new2;
 			goto retry;
 		}
 	}
 	*head_ret = head;
+	*head2_ret = head2;
 }
 
-static inline void conn_tab_unlock(struct hlist_bl_head *head)
+static inline void conn_tab_unlock(struct hlist_bl_head *head,
+				   struct hlist_bl_head *head2)
 {
+	if (head != head2)
+		hlist_bl_unlock(head2);
 	hlist_bl_unlock(head);
 	local_bh_enable();
 }
@@ -123,26 +164,34 @@  static void ip_vs_conn_expire(struct timer_list *t);
  *	Returns hash value for IPVS connection entry
  */
 static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto,
-			      const union nf_inet_addr *addr, __be16 port)
+			      const union nf_inet_addr *addr, __be16 port,
+			      const union nf_inet_addr *laddr, __be16 lport)
 {
 	u64 a = (u32)proto << 16 | (__force u32)port;
+	u64 d;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		u64 b = (u64)addr->all[0] << 32 | addr->all[1];
 		u64 c = (u64)addr->all[2] << 32 | addr->all[3];
 
-		return (u32)siphash_3u64(a, b, c, &t->hash_key);
+		a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport;
+		c ^= laddr->all[1];
+		d = (u64)laddr->all[0] << 32 | laddr->all[3];
+		return (u32)siphash_4u64(a, b, c, d, &t->hash_key);
 	}
 #endif
 	a |= (u64)addr->all[0] << 32;
-	return (u32)siphash_1u64(a, &t->hash_key);
+	d = (u64)laddr->all[0] << 32 | (__force u32)lport;
+	return (u32)siphash_2u64(a, d, &t->hash_key);
 }
 
 static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
 					     struct ip_vs_rht *t, bool inverse)
 {
+	const union nf_inet_addr *laddr;
 	const union nf_inet_addr *addr;
+	__be16 lport;
 	__be16 port;
 
 	if (p->pe_data && p->pe->hashkey_raw)
@@ -151,21 +200,33 @@  static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
 	if (likely(!inverse)) {
 		addr = p->caddr;
 		port = p->cport;
+		laddr = p->vaddr;
+		lport = p->vport;
 	} else {
 		addr = p->vaddr;
 		port = p->vport;
+		laddr = p->caddr;
+		lport = p->cport;
 	}
 
-	return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port);
+	return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr,
+				  lport);
 }
 
 static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t,
-					    const struct ip_vs_conn *cp)
+					    const struct ip_vs_conn *cp,
+					    bool out)
 {
 	struct ip_vs_conn_param p;
 
-	ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
-			      &cp->caddr, cp->cport, NULL, 0, &p);
+	if (!out)
+		ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
+				      &cp->caddr, cp->cport, &cp->vaddr,
+				      cp->vport, &p);
+	else
+		ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
+				      &cp->daddr, cp->dport, &cp->caddr,
+				      cp->cport, &p);
 
 	if (cp->pe) {
 		p.pe = cp->pe;
@@ -173,7 +234,7 @@  static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t,
 		p.pe_data_len = cp->pe_data_len;
 	}
 
-	return ip_vs_conn_hashkey_param(&p, t, false);
+	return ip_vs_conn_hashkey_param(&p, t, out);
 }
 
 /*	Hashes ip_vs_conn in conn_tab
@@ -182,9 +243,11 @@  static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t,
 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 {
 	struct netns_ipvs *ipvs = cp->ipvs;
-	struct hlist_bl_head *head;
+	struct hlist_bl_head *head, *head2;
+	u32 hash_key, hash_key2;
 	struct ip_vs_rht *t;
-	u32 hash_key;
+	u32 hash, hash2;
+	bool use2;
 	int ret;
 
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
@@ -194,15 +257,28 @@  static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	t = rcu_dereference(ipvs->conn_tab);
 	t = rcu_dereference(t->new_tbl);
 
-	hash_key = ip_vs_rht_build_hash_key(t, ip_vs_conn_hashkey_conn(t, cp));
-	conn_tab_lock(t, cp, hash_key, true /* new_hash */, &head);
+	hash = ip_vs_conn_hashkey_conn(t, cp, false);
+	hash_key = ip_vs_rht_build_hash_key(t, hash);
+	if (ip_vs_conn_use_hash2(cp)) {
+		hash2 = ip_vs_conn_hashkey_conn(t, cp, true);
+		hash_key2 = ip_vs_rht_build_hash_key(t, hash2);
+		use2 = true;
+	} else {
+		hash_key2 = hash_key;
+		use2 = false;
+	}
+	conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
+		      &head, &head2);
 	spin_lock(&cp->lock);
 
 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
 		cp->flags |= IP_VS_CONN_F_HASHED;
-		WRITE_ONCE(cp->hash_key, hash_key);
+		WRITE_ONCE(cp->hn0.hash_key, hash_key);
+		WRITE_ONCE(cp->hn1.hash_key, hash_key2);
 		refcount_inc(&cp->refcnt);
-		hlist_bl_add_head_rcu(&cp->c_list, head);
+		hlist_bl_add_head_rcu(&cp->hn0.node, head);
+		if (use2)
+			hlist_bl_add_head_rcu(&cp->hn1.node, head2);
 		ret = 1;
 	} else {
 		pr_err("%s(): request for already hashed, called from %pS\n",
@@ -211,7 +287,7 @@  static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	}
 
 	spin_unlock(&cp->lock);
-	conn_tab_unlock(head);
+	conn_tab_unlock(head, head2);
 
 	/* Schedule resizing if load increases */
 	if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
@@ -227,10 +303,11 @@  static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 {
 	struct netns_ipvs *ipvs = cp->ipvs;
-	struct hlist_bl_head *head;
+	struct hlist_bl_head *head, *head2;
+	u32 hash_key, hash_key2;
 	struct ip_vs_rht *t;
 	bool ret = false;
-	u32 hash_key;
+	bool use2;
 
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		return refcount_dec_if_one(&cp->refcnt);
@@ -238,22 +315,27 @@  static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 	rcu_read_lock();
 
 	t = rcu_dereference(ipvs->conn_tab);
-	hash_key = READ_ONCE(cp->hash_key);
+	hash_key = READ_ONCE(cp->hn0.hash_key);
+	hash_key2 = READ_ONCE(cp->hn1.hash_key);
+	use2 = ip_vs_conn_use_hash2(cp);
 
-	conn_tab_lock(t, cp, hash_key, false /* new_hash */, &head);
+	conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
+		      &head, &head2);
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
 		/* Decrease refcnt and unlink conn only if we are last user */
 		if (refcount_dec_if_one(&cp->refcnt)) {
-			hlist_bl_del_rcu(&cp->c_list);
+			hlist_bl_del_rcu(&cp->hn0.node);
+			if (use2)
+				hlist_bl_del_rcu(&cp->hn1.node);
 			cp->flags &= ~IP_VS_CONN_F_HASHED;
 			ret = true;
 		}
 	}
 
 	spin_unlock(&cp->lock);
-	conn_tab_unlock(head);
+	conn_tab_unlock(head, head2);
 
 	rcu_read_unlock();
 
@@ -272,6 +354,7 @@  __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
 	struct netns_ipvs *ipvs = p->ipvs;
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_head *head;
 	struct ip_vs_rht *t, *pt;
 	struct hlist_bl_node *e;
@@ -284,9 +367,12 @@  __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 		hash = ip_vs_conn_hashkey_param(p, t, false);
 		hash_key = ip_vs_rht_build_hash_key(t, hash);
 		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
-			hlist_bl_for_each_entry_rcu(cp, e, head, c_list) {
-				if (READ_ONCE(cp->hash_key) == hash_key &&
-				    p->cport == cp->cport &&
+			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
+				if (READ_ONCE(hn->hash_key) != hash_key ||
+				    hn->dir != 0)
+					continue;
+				cp = ip_vs_hn0_to_conn(hn);
+				if (p->cport == cp->cport &&
 				    p->vport == cp->vport && cp->af == p->af &&
 				    ip_vs_addr_equal(p->af, p->caddr,
 						     &cp->caddr) &&
@@ -376,6 +462,7 @@  struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 {
 	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
 	struct netns_ipvs *ipvs = p->ipvs;
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_head *head;
 	struct ip_vs_rht *t, *pt;
 	struct hlist_bl_node *e;
@@ -388,9 +475,11 @@  struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 		hash = ip_vs_conn_hashkey_param(p, t, false);
 		hash_key = ip_vs_rht_build_hash_key(t, hash);
 		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
-			hlist_bl_for_each_entry_rcu(cp, e, head, c_list) {
-				if (READ_ONCE(cp->hash_key) != hash_key)
+			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
+				if (READ_ONCE(hn->hash_key) != hash_key ||
+				    hn->dir != 0)
 					continue;
+				cp = ip_vs_hn0_to_conn(hn);
 				if (unlikely(p->pe_data && p->pe->ct_match)) {
 					if (p->pe == cp->pe &&
 					    p->pe->ct_match(p, cp) &&
@@ -442,6 +531,7 @@  struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
 	struct netns_ipvs *ipvs = p->ipvs;
 	const union nf_inet_addr *saddr;
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_head *head;
 	struct ip_vs_rht *t, *pt;
 	struct hlist_bl_node *e;
@@ -455,9 +545,12 @@  struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 		hash = ip_vs_conn_hashkey_param(p, t, true);
 		hash_key = ip_vs_rht_build_hash_key(t, hash);
 		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
-			hlist_bl_for_each_entry_rcu(cp, e, head, c_list) {
-				if (READ_ONCE(cp->hash_key) != hash_key ||
-				    p->vport != cp->cport)
+			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
+				/* dir can be 0 for DR/TUN */
+				if (READ_ONCE(hn->hash_key) != hash_key)
+					continue;
+				cp = ip_vs_hn_to_conn(hn);
+				if (p->vport != cp->cport)
 					continue;
 
 				if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
@@ -536,21 +629,33 @@  void ip_vs_conn_put(struct ip_vs_conn *cp)
 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 {
 	struct hlist_bl_head *head, *head2, *head_new;
+	bool use2 = ip_vs_conn_use_hash2(cp);
 	struct netns_ipvs *ipvs = cp->ipvs;
 	int af_id = ip_vs_af_index(cp->af);
 	u32 hash_r = 0, hash_key_r = 0;
 	struct ip_vs_rht *t, *tp, *t2;
+	struct ip_vs_conn_hnode *hn;
 	u32 hash_key, hash_key_new;
 	struct ip_vs_conn_param p;
 	int ntbl;
+	int dir;
 
-	ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr,
-			      cport, &cp->vaddr, cp->vport, &p);
+	/* No packets from inside, so we can do it in 2 steps. */
+	dir = use2 ? 1 : 0;
+
+next_dir:
+	if (dir)
+		ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr,
+				      cp->dport, &cp->caddr, cport, &p);
+	else
+		ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr,
+				      cport, &cp->vaddr, cp->vport, &p);
+	hn = dir ? &cp->hn1 : &cp->hn0;
 	ntbl = 0;
 
 	/* Attempt to rehash cp safely, by informing seqcount readers */
 	t = rcu_dereference(ipvs->conn_tab);
-	hash_key = READ_ONCE(cp->hash_key);
+	hash_key = READ_ONCE(hn->hash_key);
 	tp = NULL;
 
 retry:
@@ -567,7 +672,7 @@  void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 	t2 = rcu_dereference(t->new_tbl);
 	/* Calc new hash once per table */
 	if (tp != t2) {
-		hash_r = ip_vs_conn_hashkey_param(&p, t2, false);
+		hash_r = ip_vs_conn_hashkey_param(&p, t2, dir);
 		hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r);
 		tp = t2;
 	}
@@ -591,7 +696,7 @@  void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 		hlist_bl_lock(head2);
 
 	/* Ensure hash_key is read under lock */
-	hash_key_new = READ_ONCE(cp->hash_key);
+	hash_key_new = READ_ONCE(hn->hash_key);
 	/* Racing with another rehashing ? */
 	if (unlikely(hash_key != hash_key_new)) {
 		if (head != head2)
@@ -611,14 +716,21 @@  void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 		 * parameters in cp do not change, i.e. cport is
 		 * the only possible change.
 		 */
-		WRITE_ONCE(cp->hash_key, hash_key_r);
+		WRITE_ONCE(hn->hash_key, hash_key_r);
+		if (!use2)
+			WRITE_ONCE(cp->hn1.hash_key, hash_key_r);
+		/* For dir=1 we do not check in flags if hn is already
+		 * rehashed but this check will do it.
+		 */
 		if (head != head2) {
-			hlist_bl_del_rcu(&cp->c_list);
-			hlist_bl_add_head_rcu(&cp->c_list, head_new);
+			hlist_bl_del_rcu(&hn->node);
+			hlist_bl_add_head_rcu(&hn->node, head_new);
+		}
+		if (!dir) {
+			atomic_dec(&ipvs->no_cport_conns[af_id]);
+			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+			cp->cport = cport;
 		}
-		atomic_dec(&ipvs->no_cport_conns[af_id]);
-		cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
-		cp->cport = cport;
 	}
 	spin_unlock(&cp->lock);
 
@@ -628,6 +740,8 @@  void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 	write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
 	preempt_enable_nested();
 	spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
+	if (dir--)
+		goto next_dir;
 }
 
 /* Get default load factor to map conn_count/u_thresh to t->size */
@@ -639,6 +753,8 @@  static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs)
 		factor = 3;
 	else
 		factor = 1;
+	/* Double hashing adds twice more nodes for NAT */
+	factor++;
 	return factor;
 }
 
@@ -679,6 +795,7 @@  static void conn_resize_work_handler(struct work_struct *work)
 	unsigned int resched_score = 0;
 	struct hlist_bl_node *cn, *nn;
 	struct ip_vs_rht *t, *t_new;
+	struct ip_vs_conn_hnode *hn;
 	struct netns_ipvs *ipvs;
 	struct ip_vs_conn *cp;
 	bool more_work = false;
@@ -747,8 +864,9 @@  static void conn_resize_work_handler(struct work_struct *work)
 		write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]);
 		hlist_bl_lock(head);
 
-		hlist_bl_for_each_entry_safe(cp, cn, nn, head, c_list) {
-			hash = ip_vs_conn_hashkey_conn(t_new, cp);
+		hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) {
+			cp = ip_vs_hn_to_conn(hn);
+			hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir);
 			hash_key = ip_vs_rht_build_hash_key(t_new, hash);
 
 			head2 = t_new->buckets + (hash & t_new->mask);
@@ -756,9 +874,12 @@  static void conn_resize_work_handler(struct work_struct *work)
 			/* t_new->seqc are not used at this stage, we race
 			 * only with add/del, so only lock the bucket.
 			 */
-			hlist_bl_del_rcu(&cp->c_list);
-			WRITE_ONCE(cp->hash_key, hash_key);
-			hlist_bl_add_head_rcu(&cp->c_list, head2);
+			hlist_bl_del_rcu(&hn->node);
+			WRITE_ONCE(hn->hash_key, hash_key);
+			/* Keep both hash keys in sync if no double hashing */
+			if (!ip_vs_conn_use_hash2(cp))
+				WRITE_ONCE(cp->hn1.hash_key, hash_key);
+			hlist_bl_add_head_rcu(&hn->node, head2);
 			hlist_bl_unlock(head2);
 			/* Too long chain? Do it in steps */
 			if (++limit >= 64)
@@ -1236,10 +1357,13 @@  ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
 		return NULL;
 	}
 
-	INIT_HLIST_BL_NODE(&cp->c_list);
+	INIT_HLIST_BL_NODE(&cp->hn0.node);
+	INIT_HLIST_BL_NODE(&cp->hn1.node);
 	timer_setup(&cp->timer, ip_vs_conn_expire, 0);
 	cp->ipvs	   = ipvs;
+	cp->hn0.dir	   = 0;
 	cp->af		   = p->af;
+	cp->hn1.dir	   = 1;
 	cp->daf		   = dest_af;
 	cp->protocol	   = p->protocol;
 	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
@@ -1343,22 +1467,24 @@  static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 	struct net *net = seq_file_net(seq);
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_rht *t = iter->t;
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_node *e;
-	struct ip_vs_conn *cp;
 	int idx;
 
 	if (!t)
 		return NULL;
 	for (idx = 0; idx < t->size; idx++) {
-		hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[idx], c_list) {
+		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) {
 			/* __ip_vs_conn_get() is not needed by
 			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
 			 */
-			if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key)))
+			if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
 				break;
+			if (hn->dir != 0)
+				continue;
 			if (pos-- == 0) {
 				iter->bucket = idx;
-				return cp;
+				return hn;
 			}
 		}
 		if (!(idx & 31)) {
@@ -1391,7 +1517,7 @@  static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct ip_vs_iter_state *iter = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct netns_ipvs *ipvs = net_ipvs(net);
-	struct ip_vs_conn *cp = v;
+	struct ip_vs_conn_hnode *hn = v;
 	struct hlist_bl_node *e;
 	struct ip_vs_rht *t;
 
@@ -1404,19 +1530,21 @@  static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return NULL;
 
 	/* more on same hash chain? */
-	hlist_bl_for_each_entry_continue_rcu(cp, e, c_list) {
+	hlist_bl_for_each_entry_continue_rcu(hn, e, node) {
 		/* Our cursor was moved to new table ? */
-		if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key)))
+		if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
 			break;
-		return cp;
+		if (!hn->dir)
+			return hn;
 	}
 
 	while (++iter->bucket < t->size) {
-		hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[iter->bucket],
-					    c_list) {
-			if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key)))
+		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[iter->bucket],
+					    node) {
+			if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
 				break;
-			return cp;
+			if (!hn->dir)
+				return hn;
 		}
 		if (!(iter->bucket & 31)) {
 			cond_resched_rcu();
@@ -1441,7 +1569,8 @@  static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq,
    "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
 	else {
-		const struct ip_vs_conn *cp = v;
+		struct ip_vs_conn_hnode *hn = v;
+		const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn);
 		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
 		size_t len = 0;
 		char dbuf[IP_VS_ADDRSTRLEN];
@@ -1608,6 +1737,7 @@  static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
 
 void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
 {
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_node *e;
 	struct ip_vs_conn *cp;
 	struct ip_vs_rht *t;
@@ -1628,7 +1758,10 @@  void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
 		/* Don't care if due to moved entry we jump to another bucket
 		 * and even to new table
 		 */
-		hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[hash], c_list) {
+		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], node) {
+			if (hn->dir != 0)
+				continue;
+			cp = ip_vs_hn0_to_conn(hn);
 			if (atomic_read(&cp->n_control))
 				continue;
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
@@ -1693,6 +1826,7 @@  static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
 {
 	DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
 	struct ip_vs_conn *cp, *cp_c;
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_head *head;
 	struct ip_vs_rht *t, *p;
 	struct hlist_bl_node *e;
@@ -1707,7 +1841,10 @@  static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
 	/* Rely on RCU grace period while accessing cp after ip_vs_conn_del */
 	rcu_read_lock();
 	ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) {
-		hlist_bl_for_each_entry_rcu(cp, e, head, c_list) {
+		hlist_bl_for_each_entry_rcu(hn, e, head, node) {
+			if (hn->dir != 0)
+				continue;
+			cp = ip_vs_hn0_to_conn(hn);
 			if (atomic_read(&cp->n_control))
 				continue;
 			cp_c = cp->control;
@@ -1754,6 +1891,7 @@  void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
 	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
 	unsigned int resched_score = 0;
 	struct ip_vs_conn *cp, *cp_c;
+	struct ip_vs_conn_hnode *hn;
 	struct hlist_bl_head *head;
 	struct ip_vs_dest *dest;
 	struct hlist_bl_node *e;
@@ -1767,7 +1905,10 @@  void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
 repeat:
 	smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
 	ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) {
-		hlist_bl_for_each_entry_rcu(cp, e, head, c_list) {
+		hlist_bl_for_each_entry_rcu(hn, e, head, node) {
+			if (hn->dir != 0)
+				continue;
+			cp = ip_vs_hn0_to_conn(hn);
 			resched_score++;
 			dest = cp->dest;
 			if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))