Skip to content

Commit d8cf7ce

Browse files
committed
tcp: add the ability to control max RTO
JIRA: https://issues.redhat.com/browse/RHEL-115191
Upstream Status: linux.git
Conflicts:
- Context differences due to missing upstream commits 54b771e ("doc: net: Fix .rst rendering of net_cachelines pages"), 58169ec ("inet: preserve const qualifier in inet_csk()"), f086ede ("tcp: add sysctl_tcp_rto_min_us") and ccce324 ("tcp: make the first N SYN RTO backoffs linear") in c9s.
- Context differences due to socket options not being in c9s (TCP AO & MPTCP).

commit 54a378f
Author: Eric Dumazet <edumazet@google.com>
Date: Fri Feb 7 15:28:29 2025 +0000

    tcp: add the ability to control max RTO

    Currently, TCP stack uses a constant (120 seconds) to limit the RTO value exponential growth.

    Some applications want to set a lower value.

    Add TCP_RTO_MAX_MS socket option to set a value (in ms) between 1 and 120 seconds.

    It is discouraged to change the socket rto max on a live socket, as it might lead to unexpected disconnects.

    Following patch is adding a netns sysctl to control the default value at socket creation time.

    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
    Reviewed-by: Neal Cardwell <ncardwell@google.com>
    Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
    Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Signed-off-by: Antoine Tenart <atenart@redhat.com>
1 parent b8f6fe6 commit d8cf7ce

File tree

9 files changed

+40
-17
lines changed

9 files changed

+40
-17
lines changed

Documentation/networking/net_cachelines/inet_connection_sock.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ struct_timer_list icsk_retransmit_timer read_mostly -
1616
struct_timer_list icsk_delack_timer read_mostly - inet_csk_reset_xmit_timer,tcp_connect
1717
u32 icsk_rto read_write - tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
1818
u32 icsk_rto_min - -
19+
u32 icsk_rto_max read_mostly - tcp_reset_xmit_timer
1920
u32 icsk_delack_max - -
2021
u32 icsk_pmtu_cookie read_write - tcp_sync_mss,tcp_current_mss,tcp_send_syn_data,tcp_connect_init,tcp_connect
2122
struct_tcp_congestion_ops icsk_ca_ops read_write - tcp_cwnd_validate,tcp_tso_segs,tcp_ca_dst_init,tcp_connect_init,tcp_connect,tcp_write_xmit

include/net/inet_connection_sock.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ struct inet_connection_sock {
8888
struct timer_list icsk_delack_timer;
8989
__u32 icsk_rto;
9090
__u32 icsk_rto_min;
91+
u32 icsk_rto_max;
9192
__u32 icsk_delack_max;
9293
__u32 icsk_pmtu_cookie;
9394
const struct tcp_congestion_ops *icsk_ca_ops;

include/net/tcp.h

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -141,8 +141,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
141141
#define TCP_DELACK_MIN 4U
142142
#define TCP_ATO_MIN 4U
143143
#endif
144-
#define TCP_RTO_MAX ((unsigned)(120*HZ))
145-
#define TCP_RTO_MIN ((unsigned)(HZ/5))
144+
#define TCP_RTO_MAX_SEC 120
145+
#define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ))
146+
#define TCP_RTO_MIN ((unsigned)(HZ / 5))
146147
#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
147148

148149
#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */
@@ -711,10 +712,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu);
711712
int tcp_mss_to_mtu(struct sock *sk, int mss);
712713
void tcp_mtup_init(struct sock *sk);
713714

715+
static inline unsigned int tcp_rto_max(const struct sock *sk)
716+
{
717+
return READ_ONCE(inet_csk(sk)->icsk_rto_max);
718+
}
719+
714720
static inline void tcp_bound_rto(const struct sock *sk)
715721
{
716-
if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
717-
inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
722+
inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk));
718723
}
719724

720725
static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
@@ -1359,7 +1364,8 @@ static inline void tcp_reset_xmit_timer(struct sock *sk,
13591364
{
13601365
if (pace_delay)
13611366
when += tcp_pacing_delay(sk);
1362-
inet_csk_reset_xmit_timer(sk, what, when, TCP_RTO_MAX);
1367+
inet_csk_reset_xmit_timer(sk, what, when,
1368+
tcp_rto_max(sk));
13631369
}
13641370

13651371
/* Something is really bad, we could not queue an additional packet,

include/uapi/linux/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,7 @@ enum {
129129

130130
#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */
131131

132+
#define TCP_RTO_MAX_MS 44 /* max rto time in ms */
132133

133134
#define TCP_REPAIR_ON 1
134135
#define TCP_REPAIR_OFF 0

net/ipv4/tcp.c

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -425,7 +425,12 @@ void tcp_init_sock(struct sock *sk)
425425
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
426426

427427
icsk->icsk_rto = TCP_TIMEOUT_INIT;
428+
429+
/* Use a sysctl ? */
430+
icsk->icsk_rto_max = TCP_RTO_MAX;
431+
428432
icsk->icsk_rto_min = TCP_RTO_MIN;
433+
429434
icsk->icsk_delack_max = TCP_DELACK_MAX;
430435
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
431436
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -3669,6 +3674,11 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
36693674
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
36703675
TCP_RTO_MAX / HZ));
36713676
return 0;
3677+
case TCP_RTO_MAX_MS:
3678+
if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
3679+
return -EINVAL;
3680+
WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
3681+
return 0;
36723682
}
36733683

36743684
sockopt_lock_sock(sk);
@@ -4429,6 +4439,9 @@ int do_tcp_getsockopt(struct sock *sk, int level,
44294439
return err;
44304440
}
44314441
#endif
4442+
case TCP_RTO_MAX_MS:
4443+
val = jiffies_to_msecs(tcp_rto_max(sk));
4444+
break;
44324445
default:
44334446
return -ENOPROTOOPT;
44344447
}

net/ipv4/tcp_input.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3508,7 +3508,7 @@ static void tcp_ack_probe(struct sock *sk)
35083508
* This function is not for random using!
35093509
*/
35103510
} else {
3511-
unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
3511+
unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk));
35123512

35133513
when = tcp_clamp_probe0_to_user_timeout(sk, when);
35143514
tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true);

net/ipv4/tcp_ipv4.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -439,7 +439,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439439

440440
icsk->icsk_backoff--;
441441
icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
442-
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
442+
icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
443443

444444
tcp_mstamp_refresh(tp);
445445
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));

net/ipv4/tcp_output.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4027,7 +4027,7 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
40274027
unsigned long delay;
40284028

40294029
delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
4030-
if (delay < TCP_RTO_MAX)
4030+
if (delay < tcp_rto_max(sk))
40314031
icsk->icsk_ack.retry++;
40324032
inet_csk_schedule_ack(sk);
40334033
icsk->icsk_ack.ato = TCP_ATO_MIN;
@@ -4167,7 +4167,7 @@ void tcp_send_probe0(struct sock *sk)
41674167
if (err <= 0) {
41684168
if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
41694169
icsk->icsk_backoff++;
4170-
timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
4170+
timeout = tcp_probe0_when(sk, tcp_rto_max(sk));
41714171
} else {
41724172
/* If packet was not sent due to local congestion,
41734173
* Let senders fight for local resources conservatively.

net/ipv4/tcp_timer.c

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
109109

110110
/* If peer does not open window for long time, or did not transmit
111111
* anything for long time, penalize it. */
112-
if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
112+
if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset)
113113
shift++;
114114

115115
/* If some dubious ICMP arrived, penalize even more. */
@@ -189,12 +189,12 @@ static unsigned int tcp_model_timeout(struct sock *sk,
189189
{
190190
unsigned int linear_backoff_thresh, timeout;
191191

192-
linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);
192+
linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base);
193193
if (boundary <= linear_backoff_thresh)
194194
timeout = ((2 << boundary) - 1) * rto_base;
195195
else
196196
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
197-
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
197+
(boundary - linear_backoff_thresh) * tcp_rto_max(sk);
198198
return jiffies_to_msecs(timeout);
199199
}
200200
/**
@@ -257,7 +257,7 @@ static int tcp_write_timeout(struct sock *sk)
257257

258258
retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
259259
if (sock_flag(sk, SOCK_DEAD)) {
260-
const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
260+
const bool alive = icsk->icsk_rto < tcp_rto_max(sk);
261261

262262
retry_until = tcp_orphan_retries(sk, alive);
263263
do_reset = alive ||
@@ -395,7 +395,8 @@ static void tcp_probe_timer(struct sock *sk)
395395
}
396396
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
397397
if (sock_flag(sk, SOCK_DEAD)) {
398-
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
398+
unsigned int rto_max = tcp_rto_max(sk);
399+
const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max;
399400

400401
max_probes = tcp_orphan_retries(sk, alive);
401402
if (!alive && icsk->icsk_backoff >= max_probes)
@@ -457,7 +458,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
457458
const struct inet_connection_sock *icsk = inet_csk(sk);
458459
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
459460
const struct tcp_sock *tp = tcp_sk(sk);
460-
int timeout = TCP_RTO_MAX * 2;
461+
int timeout = tcp_rto_max(sk) * 2;
461462
u32 rtx_delta;
462463
s32 rcv_delta;
463464

@@ -628,10 +629,10 @@ void tcp_retransmit_timer(struct sock *sk)
628629
icsk->icsk_backoff = 0;
629630
icsk->icsk_rto = clamp(__tcp_set_rto(tp),
630631
tcp_rto_min(sk),
631-
TCP_RTO_MAX);
632+
tcp_rto_max(sk));
632633
} else {
633634
/* Use normal (exponential) backoff */
634-
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
635+
icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk));
635636
}
636637
tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
637638
tcp_clamp_rto_to_user_timeout(sk), false);

0 commit comments

Comments
 (0)