Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save majek/13848c050a3dc218ed295364ee717879 to your computer and use it in GitHub Desktop.
Save majek/13848c050a3dc218ed295364ee717879 to your computer and use it in GitHub Desktop.
0001-RTAX_INITRWND-should-be-able-to-bring-the-rcv_ssthresh.patch
commit 400854c8da31a5798abc871c78dcf6d41089abe1
Author: Marek Majkowski <marek@cloudflare.com>
Date: Mon Jul 18 13:08:03 2022 +0200
RTAX_INITRWND should be able to set the rcv_ssthresh above 64KiB
There are three places where we initialize sockets:
- tcp_output:tcp_connect_init
- tcp_minisocks:tcp_openreq_init_rwin
- syncookies
In the first two we already have calls to `tcp_rwnd_init_bpf` and
`dst_metric(RTAX_INITRWND)`, which retrieve the bpf/path initrwnd
attribute. We use this value to bring `rcv_ssthresh` up, potentially
above the traditional 64KiB.
With a higher initial `rcv_ssthresh` the receiver will open the receive
window more aggressively, which can improve the throughput and latency
of large BDP flows.
This patch does not cover the syncookies case.
Signed-off-by: Marek Majkowski <marek@cloudflare.com>
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8e48dc56837b..16aaf1c6b253 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -449,7 +449,8 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
struct request_sock *req,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ const struct dst_entry *dst);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 228d36692d08..1f84c9fea026 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1501,7 +1501,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- newsk = tcp_create_openreq_child(sk, req, skb);
+ newsk = tcp_create_openreq_child(sk, req, skb, dst);
if (!newsk)
goto exit_nonewsk;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6854bb1fb32b..38360ad93baf 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -359,7 +359,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
int full_space = tcp_full_space(sk_listener);
u32 window_clamp;
__u8 rcv_wscale;
- u32 rcv_wnd;
+ int adj_mss;
int mss;
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -372,20 +372,16 @@ void tcp_openreq_init_rwin(struct request_sock *req,
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
- rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
- if (rcv_wnd == 0)
- rcv_wnd = dst_metric(dst, RTAX_INITRWND);
- else if (full_space < rcv_wnd * mss)
- full_space = rcv_wnd * mss;
+ adj_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(sk_listener, full_space,
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ adj_mss,
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
- rcv_wnd);
+ 0);
ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
@@ -448,7 +444,8 @@ static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
*/
struct sock *tcp_create_openreq_child(const struct sock *sk,
struct request_sock *req,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ const struct dst_entry *dst)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
const struct inet_request_sock *ireq = inet_rsk(req);
@@ -499,10 +496,19 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
inet_csk_reset_keepalive_timer(newsk,
keepalive_time_when(newtp));
+ u32 adj_mss = req->mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+ u32 rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
+ if (rcv_wnd == 0)
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
newtp->rx_opt.sack_ok = ireq->sack_ok;
newtp->window_clamp = req->rsk_window_clamp;
- newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ if (rcv_wnd) {
+ req->rsk_rcv_wnd = min(req->rsk_rcv_wnd, rcv_wnd * adj_mss);
+ }
+ newtp->rcv_ssthresh = max(req->rsk_rcv_wnd, rcv_wnd * adj_mss);
newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 18c913a2347a..0f2d4174ea59 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3642,6 +3642,7 @@ static void tcp_connect_init(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
u32 rcv_wnd;
+ u32 mss;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3679,8 +3680,10 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+ mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ?
+ tp->tcp_header_len - sizeof(struct tcphdr) : 0);
tcp_select_initial_window(sk, tcp_full_space(sk),
- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ mss,
&tp->rcv_wnd,
&tp->window_clamp,
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
@@ -3688,7 +3691,10 @@ static void tcp_connect_init(struct sock *sk)
rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
- tp->rcv_ssthresh = tp->rcv_wnd;
+ if (rcv_wnd)
+ tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * mss);
+ else
+ tp->rcv_ssthresh = tp->rcv_wnd;
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 70d4890d8d2f..b66dc0efcfd1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1267,7 +1267,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
goto out;
}
- newsk = tcp_create_openreq_child(sk, req, skb);
+ newsk = tcp_create_openreq_child(sk, req, skb, dst);
if (!newsk)
goto out_nonewsk;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment