Skip to content

Instantly share code, notes, and snippets.

@hnakagawa
Created January 10, 2013 12:53
Show Gist options
  • Save hnakagawa/4501843 to your computer and use it in GitHub Desktop.
Save hnakagawa/4501843 to your computer and use it in GitHub Desktop.
ERを3.3にバックポートしてみた。※ERの動作は未確認
From 810e88dced5511f1785e8b227c9f95e6d74bd152 Mon Sep 17 00:00:00 2001
From: Hirofumi Nakagawa <hirofumi.nakagawa@mixi.co.jp>
Date: Fri, 11 Jan 2013 06:43:37 +0900
Subject: [PATCH] backported early retransmit to v3.3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit eed530b6c67624db3f2cf477bac7c4d005d8f7ba
Author: Yuchung Cheng <ycheng@google.com>
Date: Wed May 2 13:30:03 2012 +0000
tcp: early retransmit
This patch implements RFC 5827 early retransmit (ER) for TCP.
It reduces DUPACK threshold (dupthresh) if outstanding packets are
less than 4 to recover losses by fast recovery instead of timeout.
While the algorithm is simple, small but frequent network reordering
makes this feature dangerous: the connection repeatedly enter
false recovery and degrade performance. Therefore we implement
a mitigation suggested in the appendix of the RFC that delays
entering fast recovery by a small interval, i.e., RTT/4. Currently
ER is conservative and is disabled for the rest of the connection
after the first reordering event. A large scale web server
experiment on the performance impact of ER is summarized in
section 6 of the paper "Proportional Rate Reduction for TCP”,
IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf
Note that Linux has a similar feature called THIN_DUPACK. The
differences are THIN_DUPACK do not mitigate reorderings and is only
used after slow start. Currently ER is disabled if THIN_DUPACK is
enabled. I would be happy to merge THIN_DUPACK feature with ER if
people think it's a good idea.
ER is enabled by sysctl_tcp_early_retrans:
0: Disables ER
1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4.
2: (Default) reduce dupthresh like mode 1. In addition, delay
entering fast recovery by RTT/4.
Note: mode 2 is implemented in the third part of this patch series.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
Documentation/networking/ip-sysctl.txt | 14 ++++++++++++++
include/linux/tcp.h | 2 ++
include/net/tcp.h | 15 +++++++++++++++
net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
net/ipv4/tcp.c | 2 ++
net/ipv4/tcp_input.c | 15 +++++++++++++++
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/tcp_minisocks.c | 1 +
8 files changed, 60 insertions(+), 0 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ad3e80e..bffb96d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER
tcp_dsack - BOOLEAN
Allows TCP to send "duplicate" SACKs.
+tcp_early_retrans - INTEGER
+ Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
+ for triggering fast retransmit when the amount of outstanding data is
+ small and when no previously unsent data can be transmitted (such
+ that limited transmit could be used).
+ Possible values:
+ 0 disables ER
+ 1 enables ER
+ 2 enables ER but delays fast recovery and fast retransmit
+ by a fourth of RTT. This mitigates connection falsely
+ recovers when network has a small degree of reordering
+ (less than 3 packets).
+ Default: 2
+
tcp_ecn - INTEGER
Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
used when both ends of the TCP flow support it. It is useful to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3c7ffdb..cc616c8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -355,6 +355,8 @@ struct tcp_sock {
thin_dupack : 1,/* Fast retransmit on first dupack */
unused : 2;
+ u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */
+
/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
u32 mdev; /* medium deviation */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d80c29..97de8b6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -251,6 +251,7 @@ extern int sysctl_tcp_max_ssthresh;
extern int sysctl_tcp_cookie_size;
extern int sysctl_tcp_thin_linear_timeouts;
extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_early_retrans;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -789,6 +790,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
}
+/* TCP early-retransmit (ER) is similar to but more conservative than
+ * the thin-dupack feature. Enable ER only if thin-dupack is disabled.
+ */
+static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
+{
+ tp->do_early_retrans = sysctl_tcp_early_retrans &&
+ !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
+}
+
+static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
+{
+ tp->do_early_retrans = 0;
+}
+
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7a7724d..675d29b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
#include <net/tcp_memcontrol.h>
static int zero;
+static int two = 2;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_early_retrans",
+ .data = &sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 22ef5f9..d097085 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2292,6 +2292,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
else
tp->thin_dupack = val;
+ if (tp->thin_dupack)
+ tcp_disable_early_retrans(tp);
break;
case TCP_CORK:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5e315f..6ae6c09 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -97,6 +97,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -900,6 +901,7 @@ static void tcp_init_metrics(struct sock *sk)
if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
tp->reordering = dst_metric(dst, RTAX_REORDERING);
}
@@ -982,6 +984,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
#endif
tcp_disable_fack(tp);
}
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
}
/* This must be called before lost_out is incremented */
@@ -2486,6 +2491,16 @@ static int tcp_time_to_recover(struct sock *sk)
tcp_is_sack(tp) && !tcp_send_head(sk))
return 1;
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return 1;
+
return 0;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fd54c5f..838f65b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1897,6 +1897,7 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = sysctl_tcp_reordering;
+ tcp_enable_early_retrans(tp);
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 550e755..a311da7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_enable_early_retrans(newtp);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
--
1.7.4.4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment