Created
January 10, 2013 12:53
-
-
Save hnakagawa/4501843 to your computer and use it in GitHub Desktop.
ERを3.3にバックポートしてみた。※ERの動作は未確認
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 810e88dced5511f1785e8b227c9f95e6d74bd152 Mon Sep 17 00:00:00 2001 | |
From: Hirofumi Nakagawa <hirofumi.nakagawa@mixi.co.jp> | |
Date: Fri, 11 Jan 2013 06:43:37 +0900 | |
Subject: [PATCH] backported early retransmit to v3.3 | |
MIME-Version: 1.0 | |
Content-Type: text/plain; charset=UTF-8 | |
Content-Transfer-Encoding: 8bit | |
commit eed530b6c67624db3f2cf477bac7c4d005d8f7ba | |
Author: Yuchung Cheng <ycheng@google.com> | |
Date: Wed May 2 13:30:03 2012 +0000 | |
tcp: early retransmit | |
This patch implements RFC 5827 early retransmit (ER) for TCP. | |
It reduces DUPACK threshold (dupthresh) if outstanding packets are | |
less than 4 to recover losses by fast recovery instead of timeout. | |
While the algorithm is simple, small but frequent network reordering | |
makes this feature dangerous: the connection repeatedly enters | |
false recovery and degrades performance. Therefore we implement | |
a mitigation suggested in the appendix of the RFC that delays | |
entering fast recovery by a small interval, i.e., RTT/4. Currently | |
ER is conservative and is disabled for the rest of the connection | |
after the first reordering event. A large scale web server | |
experiment on the performance impact of ER is summarized in | |
section 6 of the paper "Proportional Rate Reduction for TCP", | |
IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf | |
Note that Linux has a similar feature called THIN_DUPACK. The | |
differences are that THIN_DUPACK does not mitigate reordering and is only | |
used after slow start. Currently ER is disabled if THIN_DUPACK is | |
enabled. I would be happy to merge THIN_DUPACK feature with ER if | |
people think it's a good idea. | |
ER is enabled by sysctl_tcp_early_retrans: | |
0: Disables ER | |
1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4. | |
2: (Default) reduce dupthresh like mode 1. In addition, delay | |
entering fast recovery by RTT/4. | |
Note: mode 2 is implemented in the third part of this patch series. | |
Signed-off-by: Yuchung Cheng <ycheng@google.com> | |
Acked-by: Neal Cardwell <ncardwell@google.com> | |
Signed-off-by: David S. Miller <davem@davemloft.net> | |
--- | |
Documentation/networking/ip-sysctl.txt | 14 ++++++++++++++ | |
include/linux/tcp.h | 2 ++ | |
include/net/tcp.h | 15 +++++++++++++++ | |
net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ | |
net/ipv4/tcp.c | 2 ++ | |
net/ipv4/tcp_input.c | 15 +++++++++++++++ | |
net/ipv4/tcp_ipv4.c | 1 + | |
net/ipv4/tcp_minisocks.c | 1 + | |
8 files changed, 60 insertions(+), 0 deletions(-) | |
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt | |
index ad3e80e..bffb96d 100644 | |
--- a/Documentation/networking/ip-sysctl.txt | |
+++ b/Documentation/networking/ip-sysctl.txt | |
@@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER | |
tcp_dsack - BOOLEAN | |
Allows TCP to send "duplicate" SACKs. | |
+tcp_early_retrans - INTEGER | |
+ Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold | |
+ for triggering fast retransmit when the amount of outstanding data is | |
+ small and when no previously unsent data can be transmitted (such | |
+ that limited transmit could be used). | |
+ Possible values: | |
+ 0 disables ER | |
+ 1 enables ER | |
+ 2 enables ER but delays fast recovery and fast retransmit | |
+ by a fourth of RTT. This mitigates false recovery | |
+ when the network has a small degree of reordering | |
+ (less than 3 packets). | |
+ Default: 2 | |
+ | |
tcp_ecn - INTEGER | |
Enable Explicit Congestion Notification (ECN) in TCP. ECN is only | |
used when both ends of the TCP flow support it. It is useful to | |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h | |
index 3c7ffdb..cc616c8 100644 | |
--- a/include/linux/tcp.h | |
+++ b/include/linux/tcp.h | |
@@ -355,6 +355,8 @@ struct tcp_sock { | |
thin_dupack : 1,/* Fast retransmit on first dupack */ | |
unused : 2; | |
+ u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */ | |
+ | |
/* RTT measurement */ | |
u32 srtt; /* smoothed round trip time << 3 */ | |
u32 mdev; /* medium deviation */ | |
diff --git a/include/net/tcp.h b/include/net/tcp.h | |
index 2d80c29..97de8b6 100644 | |
--- a/include/net/tcp.h | |
+++ b/include/net/tcp.h | |
@@ -251,6 +251,7 @@ extern int sysctl_tcp_max_ssthresh; | |
extern int sysctl_tcp_cookie_size; | |
extern int sysctl_tcp_thin_linear_timeouts; | |
extern int sysctl_tcp_thin_dupack; | |
+extern int sysctl_tcp_early_retrans; | |
extern atomic_long_t tcp_memory_allocated; | |
extern struct percpu_counter tcp_sockets_allocated; | |
@@ -789,6 +790,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) | |
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; | |
} | |
+/* TCP early-retransmit (ER) is similar to but more conservative than | |
+ * the thin-dupack feature. Enable ER only if thin-dupack is disabled. | |
+ */ | |
+static inline void tcp_enable_early_retrans(struct tcp_sock *tp) | |
+{ | |
+ tp->do_early_retrans = sysctl_tcp_early_retrans && | |
+ !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; | |
+} | |
+ | |
+static inline void tcp_disable_early_retrans(struct tcp_sock *tp) | |
+{ | |
+ tp->do_early_retrans = 0; | |
+} | |
+ | |
static inline unsigned int tcp_left_out(const struct tcp_sock *tp) | |
{ | |
return tp->sacked_out + tp->lost_out; | |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c | |
index 7a7724d..675d29b 100644 | |
--- a/net/ipv4/sysctl_net_ipv4.c | |
+++ b/net/ipv4/sysctl_net_ipv4.c | |
@@ -27,6 +27,7 @@ | |
#include <net/tcp_memcontrol.h> | |
static int zero; | |
+static int two = 2; | |
static int tcp_retr1_max = 255; | |
static int ip_local_port_range_min[] = { 1, 1 }; | |
static int ip_local_port_range_max[] = { 65535, 65535 }; | |
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = { | |
.proc_handler = proc_dointvec | |
}, | |
{ | |
+ .procname = "tcp_early_retrans", | |
+ .data = &sysctl_tcp_early_retrans, | |
+ .maxlen = sizeof(int), | |
+ .mode = 0644, | |
+ .proc_handler = proc_dointvec_minmax, | |
+ .extra1 = &zero, | |
+ .extra2 = &two, | |
+ }, | |
+ { | |
.procname = "udp_mem", | |
.data = &sysctl_udp_mem, | |
.maxlen = sizeof(sysctl_udp_mem), | |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c | |
index 22ef5f9..d097085 100644 | |
--- a/net/ipv4/tcp.c | |
+++ b/net/ipv4/tcp.c | |
@@ -2292,6 +2292,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |
err = -EINVAL; | |
else | |
tp->thin_dupack = val; | |
+ if (tp->thin_dupack) | |
+ tcp_disable_early_retrans(tp); | |
break; | |
case TCP_CORK: | |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c | |
index b5e315f..6ae6c09 100644 | |
--- a/net/ipv4/tcp_input.c | |
+++ b/net/ipv4/tcp_input.c | |
@@ -97,6 +97,7 @@ int sysctl_tcp_thin_dupack __read_mostly; | |
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | |
int sysctl_tcp_abc __read_mostly; | |
+int sysctl_tcp_early_retrans __read_mostly = 2; | |
#define FLAG_DATA 0x01 /* Incoming frame contained data. */ | |
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | |
@@ -900,6 +901,7 @@ static void tcp_init_metrics(struct sock *sk) | |
if (dst_metric(dst, RTAX_REORDERING) && | |
tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | |
tcp_disable_fack(tp); | |
+ tcp_disable_early_retrans(tp); | |
tp->reordering = dst_metric(dst, RTAX_REORDERING); | |
} | |
@@ -982,6 +984,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | |
#endif | |
tcp_disable_fack(tp); | |
} | |
+ | |
+ if (metric > 0) | |
+ tcp_disable_early_retrans(tp); | |
} | |
/* This must be called before lost_out is incremented */ | |
@@ -2486,6 +2491,16 @@ static int tcp_time_to_recover(struct sock *sk) | |
tcp_is_sack(tp) && !tcp_send_head(sk)) | |
return 1; | |
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious | |
+ * retransmissions due to small network reorderings, we implement | |
+ * Mitigation A.3 in the RFC and delay the retransmission for a short | |
+ * interval if appropriate. | |
+ */ | |
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && | |
+ (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && | |
+ !tcp_may_send_now(sk)) | |
+ return 1; | |
+ | |
return 0; | |
} | |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c | |
index fd54c5f..838f65b 100644 | |
--- a/net/ipv4/tcp_ipv4.c | |
+++ b/net/ipv4/tcp_ipv4.c | |
@@ -1897,6 +1897,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |
tp->mss_cache = TCP_MSS_DEFAULT; | |
tp->reordering = sysctl_tcp_reordering; | |
+ tcp_enable_early_retrans(tp); | |
icsk->icsk_ca_ops = &tcp_init_congestion_ops; | |
sk->sk_state = TCP_CLOSE; | |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c | |
index 550e755..a311da7 100644 | |
--- a/net/ipv4/tcp_minisocks.c | |
+++ b/net/ipv4/tcp_minisocks.c | |
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |
newtp->sacked_out = 0; | |
newtp->fackets_out = 0; | |
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | |
+ tcp_enable_early_retrans(newtp); | |
/* So many TCP implementations out there (incorrectly) count the | |
* initial SYN frame in their delayed-ACK and congestion control | |
-- | |
1.7.4.4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment