@@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u64 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16,
- struct inet_timewait_sock **));
+ struct inet_timewait_sock **,
+ bool rcu_lookup));

int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk);
@@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
@@ -556,17 +557,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
spinlock_t *lock;

- rcu_read_lock();
- sk_nulls_for_each(sk2, node, &head->chain) {
- if (sk2->sk_hash != hash ||
- !inet_match(net, sk2, acookie, ports, dif, sdif))
- continue;
- if (sk2->sk_state == TCP_TIME_WAIT)
- break;
- rcu_read_unlock();
- return -EADDRNOTAVAIL;
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet_match(net, sk2, acookie, ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
}
- rcu_read_unlock();

lock = inet_ehash_lockp(hinfo, hash);
spin_lock(lock);
@@ -1007,7 +1008,8 @@ static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u64 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
- struct sock *, __u16, struct inet_timewait_sock **))
+ struct sock *, __u16, struct inet_timewait_sock **,
+ bool rcu_lookup))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_bind_hashbucket *head, *head2;
@@ -1025,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,

if (port) {
local_bh_disable();
- ret = check_established(death_row, sk, port, NULL);
+ ret = check_established(death_row, sk, port, NULL, false);
local_bh_enable();
return ret;
}
@@ -1061,6 +1063,21 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tb, &head->chain, node) {
+ if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+ continue;
+ if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+ rcu_read_unlock();
+ goto next_port;
+ }
+ if (!check_established(death_row, sk, port, &tw, true))
+ break;
+ rcu_read_unlock();
+ goto next_port;
+ }
+ rcu_read_unlock();
+
spin_lock_bh(&head->lock);

/* Does not bother with rcv_saddr checks, because
@@ -1070,12 +1087,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
- goto next_port;
+ goto next_port_unlock;
WARN_ON(hlist_empty(&tb->bhash2));
if (!check_established(death_row, sk,
- port, &tw))
+ port, &tw, false))
goto ok;
- goto next_port;
+ goto next_port_unlock;
}
}

@@ -1089,8 +1106,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
tb->fastreuse = -1;
tb->fastreuseport = -1;
goto ok;
-next_port:
+next_port_unlock:
spin_unlock_bh(&head->lock);
+next_port:
cond_resched();
}

@@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);

static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
@@ -281,17 +282,18 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
spinlock_t *lock;

- rcu_read_lock();
- sk_nulls_for_each(sk2, node, &head->chain) {
- if (sk2->sk_hash != hash ||
- !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))
- continue;
- if (sk2->sk_state == TCP_TIME_WAIT)
- break;
- rcu_read_unlock();
- return -EADDRNOTAVAIL;
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet6_match(net, sk2, saddr, daddr,
+ ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
}
- rcu_read_unlock();

lock = inet_ehash_lockp(hinfo, hash);
spin_lock(lock);
When __inet_hash_connect() has to try many 4-tuples before
finding an available one, we see a high spinlock cost from
the many spin_lock_bh(&head->lock) performed in its loop.

This patch adds an RCU lookup to avoid the spinlock cost.

check_established() gets a new @rcu_lookup argument. The first
reason is to not make any changes while head->lock is not held;
the second is to not make this RCU lookup a second time after
the spinlock has been acquired.

Tested:

Server:
ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog

Client:
ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server

Before series:

utime_start=0.288582
utime_end=1.548707
stime_start=20.637138
stime_end=2002.489845
num_transactions=484453
latency_min=0.156279245
latency_max=20.922042756
latency_mean=1.546521274
latency_stddev=3.936005194
num_samples=312537
throughput=47426.00

perf top on the client:

 49.54%  [kernel]  [k] _raw_spin_lock
 25.87%  [kernel]  [k] _raw_spin_lock_bh
  5.97%  [kernel]  [k] queued_spin_lock_slowpath
  5.67%  [kernel]  [k] __inet_hash_connect
  3.53%  [kernel]  [k] __inet6_check_established
  3.48%  [kernel]  [k] inet6_ehashfn
  0.64%  [kernel]  [k] rcu_all_qs

After this series:

utime_start=0.271607
utime_end=3.847111
stime_start=18.407684
stime_end=1997.485557
num_transactions=1350742
latency_min=0.014131929
latency_max=17.895073144
latency_mean=0.505675853   # Nice reduction of latency metrics
latency_stddev=2.125164772
num_samples=307884
throughput=139866.80       # 190 % increase

perf top on the client:

 56.86%  [kernel]  [k] __inet6_check_established
 17.96%  [kernel]  [k] __inet_hash_connect
 13.88%  [kernel]  [k] inet6_ehashfn
  2.52%  [kernel]  [k] rcu_all_qs
  2.01%  [kernel]  [k] __cond_resched
  0.41%  [kernel]  [k] _raw_spin_lock

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_hashtables.h |  3 +-
 net/ipv4/inet_hashtables.c    | 52 +++++++++++++++++++++++------------
 net/ipv6/inet6_hashtables.c   | 24 ++++++++--------
 3 files changed, 50 insertions(+), 29 deletions(-)
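Note for readers (not part of the patch): the two-pass scheme above is easy to
see in isolation. Below is a minimal userspace C sketch of the pattern: an
optimistic, lockless pass rejects busy ports cheaply, then the same check is
repeated under the lock before anything is modified. All names here (struct
bucket, try_claim_port, ...) are made up for illustration, and the sketch is
effectively single-threaded: in the kernel the unlocked walk is only safe
because the chain is RCU-protected, which plain C pointers do not provide.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
	int port;
	struct entry *next;
};

struct bucket {
	pthread_mutex_t lock;
	struct entry *head;   /* stand-in for the RCU-protected hash chain */
};

/* Mirrors the new @rcu_lookup flag: when @locked is false the caller holds
 * no lock, so this pass may only read and report, never modify state. */
static bool port_in_use(const struct bucket *b, int port, bool locked)
{
	(void)locked;
	for (const struct entry *e = b->head; e; e = e->next) {
		if (e->port == port)
			return true;
	}
	return false;
}

static int try_claim_port(struct bucket *b, struct entry *me, int port)
{
	/* Optimistic pass, analogous to the RCU lookup: reject a busy
	 * port without ever touching the bucket lock. */
	if (port_in_use(b, port, false))
		return -1;

	/* Authoritative pass: take the lock, re-run the same check to
	 * close the race window, and only then commit the change. */
	pthread_mutex_lock(&b->lock);
	if (port_in_use(b, port, true)) {
		pthread_mutex_unlock(&b->lock);
		return -1;   /* lost the race; caller tries the next port */
	}
	me->port = port;
	me->next = b->head;
	b->head = me;
	pthread_mutex_unlock(&b->lock);
	return 0;
}

int main(void)
{
	struct bucket b = { .lock = PTHREAD_MUTEX_INITIALIZER, .head = NULL };
	struct entry e1, e2;

	printf("first claim of 40000:  %d\n", try_claim_port(&b, &e1, 40000));
	printf("second claim of 40000: %d\n", try_claim_port(&b, &e2, 40000));
	return 0;
}

This also illustrates why a single check_established() with a flag is
preferable to two separate helpers: both passes must run exactly the same
match logic, and only the locked pass is allowed to act on what it finds
(e.g. dealing with a matching TIME_WAIT entry and inserting the new socket).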