@@ -294,6 +294,9 @@ struct tcp_sock {
u32 snd_up; /* Urgent pointer */
u32 delivered; /* Total data packets delivered incl. rexmits */
u32 delivered_ce; /* Like the above but only ECE marked packets */
+ u32 received_ce; /* Like the above but for received CE marked packets */
+ u8 received_ce_pending:4, /* Not yet transmitted cnt of received_ce */
+ unused2:4;
u32 app_limited; /* limited until "delivered" reaches this val */
u32 rcv_wnd; /* Current receiver window */
/*
@@ -413,6 +413,11 @@ static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode)
tp->ecn_flags |= mode;
}

+static inline u8 tcp_accecn_ace(const struct tcphdr *th)
+{
+ return (th->ae << 2) | (th->cwr << 1) | th->ece;
+}
+
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
TCP_TW_RST = 1,
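For reference, the decode above can be exercised stand-alone. A minimal user-space sketch, with a hypothetical struct hdr_bits standing in for the kernel's tcphdr bitfields (not the real definition):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the three AccECN header bits; NOT the kernel's tcphdr. */
struct hdr_bits {
	uint8_t ae, cwr, ece;		/* one bit each on the wire */
};

static uint8_t ace_decode(const struct hdr_bits *th)
{
	/* AE is the most significant of the three ACE bits */
	return (uint8_t)((th->ae << 2) | (th->cwr << 1) | th->ece);
}

int main(void)
{
	struct hdr_bits th = { .ae = 1, .cwr = 0, .ece = 1 };

	printf("ACE = %u\n", ace_decode(&th));	/* prints: ACE = 5 */
	return 0;
}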
@@ -938,6 +943,20 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)

+#define TCP_ACCECN_CEP_ACE_MASK 0x7
+#define TCP_ACCECN_ACE_MAX_DELTA 6
+
+/* To avoid/detect middlebox interference, not all counters start at 0.
+ * See draft-ietf-tcpm-accurate-ecn for the latest values.
+ */
+#define TCP_ACCECN_CEP_INIT_OFFSET 5
+
+static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
+{
+ tp->received_ce = 0;
+ tp->received_ce_pending = 0;
+}
+
/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */
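The non-zero origin is easy to check in isolation: with no CE marks received, the wire value is 5, so a middlebox that zeroes the ACE bits produces a value a compliant peer never sends at connection start. A minimal user-space sketch of the encoding implied by the defines above (macro names copied from the patch):

#include <stdint.h>
#include <stdio.h>

#define TCP_ACCECN_CEP_INIT_OFFSET	5
#define TCP_ACCECN_CEP_ACE_MASK		0x7

static uint8_t ace_encode(uint32_t received_ce)
{
	return (received_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
	       TCP_ACCECN_CEP_ACE_MASK;
}

int main(void)
{
	printf("%u\n", ace_encode(0));	/* 5: no CE marks yet */
	printf("%u\n", ace_encode(3));	/* 0: counter wrapped mod 8 */
	printf("%u\n", ace_encode(11));	/* 0: same residue as 3, mod 8 */
	return 0;
}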
@@ -1743,11 +1762,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
+ u32 ace;
+
/* mptcp hooks are only on the slow path */
if (sk_is_mptcp((struct sock *)tp))
return;
+ ace = tcp_ecn_mode_accecn(tp) ?
+ ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
+ TCP_ACCECN_CEP_ACE_MASK) : 0;
+
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+ (ace << 22) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
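The ace << 22 shift places the predicted counter exactly on the ECE/CWR/AE positions of the flag word. A sketch checking that alignment against the host-order values of the kernel's TCP_FLAG_* constants; the AE flag value (bit 24) is an assumption based on the AccECN preparation patches:

#include <assert.h>
#include <stdint.h>

#define FLAG_ACK	0x00100000u	/* TCP_FLAG_ACK, host order */
#define FLAG_ECE	0x00400000u	/* bit 22 */
#define FLAG_CWR	0x00800000u	/* bit 23 */
#define FLAG_AE		0x01000000u	/* bit 24 (assumed) */

int main(void)
{
	uint32_t ace = 0x5;		/* AE=1, CWR=0, ECE=1 */

	/* the 3-bit field lands on exactly the three ACE flags */
	assert((ace << 22) == (FLAG_AE | FLAG_ECE));
	assert(((0x7u << 22) & ~(FLAG_AE | FLAG_CWR | FLAG_ECE)) == 0);
	return 0;
}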
@@ -3336,6 +3336,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
+ tcp_accecn_init_counters(tp);
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -5025,6 +5026,7 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
@@ -5032,7 +5034,7 @@ static void __init tcp_struct_check(void)
/* 32bit arches with 8byte alignment on u64 fields might need padding
* before tcp_clock_cache.
*/
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 97 + 7);
/* RX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
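The size change follows from the new fields: the 92 member bytes grow by 5 (the u32 plus the bitfield byte) to 97, and the padding to the next 8-byte boundary before tcp_clock_cache grows from 4 to 7. A sketch of the same arithmetic:

#include <assert.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) / (a) * (a))

int main(void)
{
	/* 92 + 4: old members plus pad; received_ce and the u8 add 5 */
	unsigned int old_members = 92, new_members = old_members + 4 + 1;

	assert(new_members == 97);
	assert(ALIGN_UP(new_members, 8) - new_members == 7);	/* 97 + 7 */
	return 0;
}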
@@ -334,14 +334,17 @@ static bool tcp_in_quickack_mode(struct sock *sk)
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
+ /* Do not set CWR if in AccECN mode! */
if (tcp_ecn_mode_rfc3168(tp))
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
- if (tcp_hdr(skb)->cwr) {
- tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+
/* If the sender is telling us it has entered CWR, then its
* cwnd may be very low (even just 1 packet), so we should ACK
@@ -377,17 +380,16 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
- tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
- tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
@@ -421,10 +423,62 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
tp->delivered += delivered;
- if (ece_ack)
+ if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
tcp_count_delivered_ce(tp, delivered);
}

+/* Returns the ECN CE delta */
+static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta, safe_delta;
+ u32 corrected_ace;
+
+ /* Reordered ACK or uncertain due to lack of data to send and ts */
+ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
+ return 0;
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* AccECN counter might overflow on large ACKs */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+
+ corrected_ace = tcp_accecn_ace(tcp_hdr(skb)) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts - ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ return safe_delta;
+}
+
+static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta;
+
+ delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag);
+ if (delta > 0) {
+ tcp_count_delivered_ce(tp, delta);
+ *flag |= FLAG_ECE;
+ /* Recalculate header predictor */
+ if (tp->pred_flags)
+ tcp_fast_path_on(tp);
+ }
+ return delta;
+}
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
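The safe_delta computation in __tcp_accecn_process() above merits a worked example: once more than TCP_ACCECN_CEP_ACE_MASK packets are newly acked, the 3-bit field may have wrapped, so the code picks the largest count not exceeding delivered_pkts that is congruent to the observed delta mod 8, i.e. the minimum CE count consistent with the ACE field. A standalone sketch:

#include <assert.h>
#include <stdint.h>

#define ACE_MASK	0x7u

static uint32_t safe_delta(uint32_t delivered_pkts, uint32_t delta)
{
	/* largest value <= delivered_pkts with the same residue mod 8 */
	return delivered_pkts - ((delivered_pkts - delta) & ACE_MASK);
}

int main(void)
{
	assert(safe_delta(10, 2) == 10);	/* 10 mod 8 == 2 already */
	assert(safe_delta(12, 2) == 10);	/* rounded down from 12 */
	assert(safe_delta(9, 0) == 8);		/* one full wrap assumed */
	return 0;
}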
@@ -3912,7 +3966,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}

/* Returns the number of packets newly acked or sacked by the current ACK */
-static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3920,8 +3975,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
- if (flag & FLAG_ECE)
- NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+
+ if (flag & FLAG_ECE) {
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
+ }
return delivered;
}
@@ -3942,6 +4001,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* Newly CE-marked packets reported by this ACK */
u32 prior_fack;
sack_state.first_sackt = 0;
@@ -4049,6 +4109,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, &flag);
+
tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq)
@@ -4073,7 +4136,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tcp_newly_delivered(sk, delivered, flag);
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
+
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -4082,12 +4146,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, &flag);
tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -4108,7 +4174,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -5940,6 +6006,24 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
}
}
+/* Updates Accurate ECN received counters from the received IP ECN field */
+static void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb)
+{
+ u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
+ u8 is_ce = INET_ECN_is_ce(ecnfield);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!INET_ECN_is_not_ect(ecnfield)) {
+ u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+
+ tp->ecn_flags |= TCP_ECN_SEEN;
+
+ /* ACE counter tracks *all* segments including pure ACKs */
+ tp->received_ce += pcount;
+ tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU);
+ }
+}
+
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
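Because GRO can hand tcp_ecn_received_counters() a single skb standing for many wire segments, a CE mark is multiplied by gso_segs, and a pure ACK (gso_segs of 0) still counts as one segment. A user-space sketch of that accounting, with the RFC 3168 ECN codepoints written out:

#include <stdint.h>
#include <stdio.h>

enum {				/* IP header ECN field, RFC 3168 */
	ECN_NOT_ECT	= 0,
	ECN_ECT_1	= 1,
	ECN_ECT_0	= 2,
	ECN_CE		= 3,
};

static uint32_t ce_increment(uint8_t ecnfield, uint16_t gso_segs)
{
	uint32_t segs = gso_segs ? gso_segs : 1;	/* pure ACK: 1 */

	return ecnfield == ECN_CE ? segs : 0;
}

int main(void)
{
	printf("%u\n", ce_increment(ECN_CE, 45));	/* 45: GRO aggregate */
	printf("%u\n", ce_increment(ECN_CE, 0));	/* 1: pure ACK */
	printf("%u\n", ce_increment(ECN_ECT_0, 10));	/* 0: no CE mark */
	return 0;
}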
@@ -6188,6 +6272,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tp->rcv_nxt == tp->rcv_wup)
flag |= __tcp_replace_ts_recent(tp, tstamp_delta);
+ tcp_ecn_received_counters(sk, skb);
+
/* We know that such packets are checksummed
* on entry.
*/
@@ -6231,6 +6317,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/* Bulk data transfer: receiver */
skb_dst_drop(skb);
__skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -6271,6 +6358,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
return;
step5:
+ tcp_ecn_received_counters(sk, skb);
+
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
if ((int)reason < 0) {
reason = -reason;
@@ -371,6 +371,17 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
th->ece = 1;
}

+static void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp)
+{
+ u32 wire_ace;
+
+ wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
+ th->ece = !!(wire_ace & 0x1);
+ th->cwr = !!(wire_ace & 0x2);
+ th->ae = !!(wire_ace & 0x4);
+ tp->received_ce_pending = 0;
+}
+
/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
* be sent.
*/
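Pairing this encode with the tcp_accecn_ace() decode added earlier gives the full round trip. A minimal sketch (stand-in types, not the kernel's) of how the ACK receiver recovers the CE delta modulo 8:

#include <assert.h>
#include <stdint.h>

#define CEP_INIT_OFFSET	5
#define ACE_MASK	0x7u

struct bits { uint8_t ae, cwr, ece; };	/* stand-in header bits */

static void set_ace(struct bits *th, uint32_t received_ce)
{
	uint32_t wire_ace = received_ce + CEP_INIT_OFFSET;

	th->ece = wire_ace & 0x1;
	th->cwr = !!(wire_ace & 0x2);
	th->ae = !!(wire_ace & 0x4);
}

static uint32_t ce_delta(const struct bits *th, uint32_t delivered_ce)
{
	uint32_t ace = (th->ae << 2) | (th->cwr << 1) | th->ece;

	return (ace - CEP_INIT_OFFSET - delivered_ce) & ACE_MASK;
}

int main(void)
{
	struct bits th;

	set_ace(&th, 9);		/* receiver has seen 9 CE marks */
	assert(ce_delta(&th, 6) == 3);	/* sender knew of 6; 9 - 6 = 3 */
	return 0;
}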
@@ -379,11 +390,17 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_ecn_mode_rfc3168(tp)) {
+ if (!tcp_ecn_mode_any(tp))
+ return;
+
+ INET_ECN_xmit(sk);
+ if (tcp_ecn_mode_accecn(tp)) {
+ tcp_accecn_set_ace(th, tp);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
+ } else {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
- INET_ECN_xmit(sk);
if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
th->cwr = 1;