Created
July 27, 2022 11:15
-
-
Save majek/13848c050a3dc218ed295364ee717879 to your computer and use it in GitHub Desktop.
0001-RTAX_INITRWND-should-be-able-to-bring-the-rcv_ssthresh.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit 400854c8da31a5798abc871c78dcf6d41089abe1 | |
Author: Marek Majkowski <marek@cloudflare.com> | |
Date: Mon Jul 18 13:08:03 2022 +0200 | |
RTAX_INITRWND should be able to set the rcv_ssthresh above 64KiB | |
There are three places where we initialize sockets: | |
- tcp_output:tcp_connect_init | |
- tcp_minisocks:tcp_openreq_init_rwin | |
- syncookies | |
In the first two we already have a call to `tcp_rwnd_init_bpf` and | |
`dst_metric(RTAX_INITRWND)` which retrieve the bpf/path initrwnd | |
attribute. We use this value to bring `rcv_ssthresh` up, potentially | |
above the traditional 64KiB. | |
With a higher initial `rcv_ssthresh` the receiver will open the receive | |
window more aggressively, which can improve large-BDP flows, i.e. flows | |
with both high throughput and high latency. | |
This patch does not cover the syncookies case. | |
Signed-off-by: Marek Majkowski <marek@cloudflare.com> | |
diff --git a/include/net/tcp.h b/include/net/tcp.h | |
index 8e48dc56837b..16aaf1c6b253 100644 | |
--- a/include/net/tcp.h | |
+++ b/include/net/tcp.h | |
@@ -449,7 +449,8 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq); | |
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); | |
struct sock *tcp_create_openreq_child(const struct sock *sk, | |
struct request_sock *req, | |
- struct sk_buff *skb); | |
+ struct sk_buff *skb, | |
+ const struct dst_entry *dst); | |
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); | |
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, | |
struct request_sock *req, | |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c | |
index 228d36692d08..1f84c9fea026 100644 | |
--- a/net/ipv4/tcp_ipv4.c | |
+++ b/net/ipv4/tcp_ipv4.c | |
@@ -1501,7 +1501,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, | |
if (sk_acceptq_is_full(sk)) | |
goto exit_overflow; | |
- newsk = tcp_create_openreq_child(sk, req, skb); | |
+ newsk = tcp_create_openreq_child(sk, req, skb, dst); | |
if (!newsk) | |
goto exit_nonewsk; | |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c | |
index 6854bb1fb32b..38360ad93baf 100644 | |
--- a/net/ipv4/tcp_minisocks.c | |
+++ b/net/ipv4/tcp_minisocks.c | |
@@ -359,7 +359,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, | |
int full_space = tcp_full_space(sk_listener); | |
u32 window_clamp; | |
__u8 rcv_wscale; | |
- u32 rcv_wnd; | |
+ int adj_mss; | |
int mss; | |
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); | |
@@ -372,20 +372,16 @@ void tcp_openreq_init_rwin(struct request_sock *req, | |
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) | |
req->rsk_window_clamp = full_space; | |
- rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); | |
- if (rcv_wnd == 0) | |
- rcv_wnd = dst_metric(dst, RTAX_INITRWND); | |
- else if (full_space < rcv_wnd * mss) | |
- full_space = rcv_wnd * mss; | |
+ adj_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0); | |
/* tcp_full_space because it is guaranteed to be the first packet */ | |
tcp_select_initial_window(sk_listener, full_space, | |
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | |
+ adj_mss, | |
&req->rsk_rcv_wnd, | |
&req->rsk_window_clamp, | |
ireq->wscale_ok, | |
&rcv_wscale, | |
- rcv_wnd); | |
+ 0); | |
ireq->rcv_wscale = rcv_wscale; | |
} | |
EXPORT_SYMBOL(tcp_openreq_init_rwin); | |
@@ -448,7 +444,8 @@ static void smc_check_reset_syn_req(struct tcp_sock *oldtp, | |
*/ | |
struct sock *tcp_create_openreq_child(const struct sock *sk, | |
struct request_sock *req, | |
- struct sk_buff *skb) | |
+ struct sk_buff *skb, | |
+ const struct dst_entry *dst) | |
{ | |
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); | |
const struct inet_request_sock *ireq = inet_rsk(req); | |
@@ -499,10 +496,19 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, | |
inet_csk_reset_keepalive_timer(newsk, | |
keepalive_time_when(newtp)); | |
+ u32 adj_mss = req->mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0); | |
+ | |
+ u32 rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); | |
+ if (rcv_wnd == 0) | |
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND); | |
+ | |
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; | |
newtp->rx_opt.sack_ok = ireq->sack_ok; | |
newtp->window_clamp = req->rsk_window_clamp; | |
- newtp->rcv_ssthresh = req->rsk_rcv_wnd; | |
+ if (rcv_wnd) { | |
+ req->rsk_rcv_wnd = min(req->rsk_rcv_wnd, rcv_wnd * adj_mss); | |
+ } | |
+ newtp->rcv_ssthresh = max(req->rsk_rcv_wnd, rcv_wnd * adj_mss); | |
newtp->rcv_wnd = req->rsk_rcv_wnd; | |
newtp->rx_opt.wscale_ok = ireq->wscale_ok; | |
if (newtp->rx_opt.wscale_ok) { | |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c | |
index 18c913a2347a..0f2d4174ea59 100644 | |
--- a/net/ipv4/tcp_output.c | |
+++ b/net/ipv4/tcp_output.c | |
@@ -3642,6 +3642,7 @@ static void tcp_connect_init(struct sock *sk) | |
struct tcp_sock *tp = tcp_sk(sk); | |
__u8 rcv_wscale; | |
u32 rcv_wnd; | |
+ u32 mss; | |
/* We'll fix this up when we get a response from the other end. | |
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. | |
@@ -3679,8 +3680,10 @@ static void tcp_connect_init(struct sock *sk) | |
if (rcv_wnd == 0) | |
rcv_wnd = dst_metric(dst, RTAX_INITRWND); | |
+ mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ? | |
+ tp->tcp_header_len - sizeof(struct tcphdr) : 0); | |
tcp_select_initial_window(sk, tcp_full_space(sk), | |
- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | |
+ mss, | |
&tp->rcv_wnd, | |
&tp->window_clamp, | |
sock_net(sk)->ipv4.sysctl_tcp_window_scaling, | |
@@ -3688,7 +3691,10 @@ static void tcp_connect_init(struct sock *sk) | |
rcv_wnd); | |
tp->rx_opt.rcv_wscale = rcv_wscale; | |
- tp->rcv_ssthresh = tp->rcv_wnd; | |
+ if (rcv_wnd) | |
+ tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * mss); | |
+ else | |
+ tp->rcv_ssthresh = tp->rcv_wnd; | |
sk->sk_err = 0; | |
sock_reset_flag(sk, SOCK_DONE); | |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c | |
index 70d4890d8d2f..b66dc0efcfd1 100644 | |
--- a/net/ipv6/tcp_ipv6.c | |
+++ b/net/ipv6/tcp_ipv6.c | |
@@ -1267,7 +1267,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * | |
goto out; | |
} | |
- newsk = tcp_create_openreq_child(sk, req, skb); | |
+ newsk = tcp_create_openreq_child(sk, req, skb, dst); | |
if (!newsk) | |
goto out_nonewsk; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment