msm-4.14/net/ipv4/tcp_recovery.c
Neal Cardwell 5069afc313 tcp: fix excessive TLP and RACK timeouts from HZ rounding
[ Upstream commit 1c2709cfff1dedbb9591e989e2f001484208d914 ]

We discovered from packet traces of slow loss recovery on kernels with
the default HZ=250 setting (and min_rtt < 1ms) that after reordering,
when receiving a SACKed sequence range, the RACK reordering timer was
firing after about 16ms rather than the desired value of roughly
min_rtt/4 + 2ms. The problem is largely due to the RACK reorder timer
calculation adding in TCP_TIMEOUT_MIN, which is 2 jiffies. On kernels
with HZ=250, this is 2*4ms = 8ms. The TLP timer calculation has the
exact same issue.

This commit fixes the TLP transmit timer and RACK reordering timer
floor calculation to more closely match the intended 2ms floor even on
kernels with HZ=250. It does this by adding in a new
TCP_TIMEOUT_MIN_US floor of 2000 us and then converting to jiffies,
instead of the current approach of converting to jiffies and then
adding th TCP_TIMEOUT_MIN value of 2 jiffies.

Our testing has verified that on kernels with HZ=1000, as expected,
this does not produce significant changes in behavior, but on kernels
with the default HZ=250 the latency improvement can be large. For
example, our tests show that for HZ=250 kernels at low RTTs this fix
roughly halves the latency for the RACK reorder timer: instead of
mostly firing at 16ms it mostly fires at 8ms.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Fixes: bb4d991a28cc ("tcp: adjust tail loss probe timeout")
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231015174700.2206872-1-ncardwell.sw@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
2024-05-06 14:36:40 +00:00

180 lines
5.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_skb_mark_lost_uncond_verify(tp, skb);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
/* Account for retransmits that are lost again */
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
tcp_skb_pcount(skb));
}
}
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
*
* Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
* dupthresh: 3 OOO packets delivered (packet count)
* FACK: sequence delta to highest sacked sequence (sequence space)
* RACK: sent time delta to the latest delivered packet (time domain)
*
* The advantage of RACK is it applies to both original and retransmitted
* packet and therefore is robust against tail losses. Another advantage
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
* When tcp_rack_detect_loss() detects some packets are lost and we
* are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
* or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
* make us enter the CA_Recovery state.
*/
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 reo_wnd;
*reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
*/
reo_wnd = 1000;
if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
if (skb == tcp_send_head(sk))
break;
/* Skip ones already (s)acked */
if (!after(scb->end_seq, tp->snd_una) ||
scb->sacked & TCPCB_SACKED_ACKED)
continue;
if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
tp->rack.end_seq, scb->end_seq)) {
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window.
*/
u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
skb->skb_mstamp);
s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
if (remaining < 0) {
tcp_rack_mark_skb_lost(sk, skb);
continue;
}
/* Skip ones marked lost but not yet retransmitted */
if ((scb->sacked & TCPCB_LOST) &&
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
/* Record maximum wait time (+1 to avoid 0) */
*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
*/
break;
}
}
}
bool tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
if (!tp->rack.advanced)
return false;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
return !!timeout;
}
/* Record the most recently (re)sent time among the (s)acked packets
* This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
* draft-cheng-tcpm-rack-00.txt
*/
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
u64 xmit_time)
{
u32 rtt_us;
if (tp->rack.mstamp &&
!tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
end_seq, tp->rack.end_seq))
return;
rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
if (sacked & TCPCB_RETRANS) {
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
*
* If the original is lost, there is no ambiguity. Otherwise
* we assume the original can be delayed up to aRTT + min_rtt.
* the aRTT term is bounded by the fast recovery or timeout,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
if (rtt_us < tcp_min_rtt(tp))
return;
}
tp->rack.rtt_us = rtt_us;
tp->rack.mstamp = xmit_time;
tp->rack.end_seq = end_seq;
tp->rack.advanced = 1;
}
/* We have waited long enough to accommodate reordering. Mark the expired
* packets lost and retransmit them.
*/
void tcp_rack_reo_timeout(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, prior_inflight;
prior_inflight = tcp_packets_in_flight(tp);
tcp_rack_detect_loss(sk, &timeout);
if (prior_inflight != tcp_packets_in_flight(tp)) {
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
tcp_enter_recovery(sk, false);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_cwnd_reduction(sk, 1, 0);
}
tcp_xmit_retransmit_queue(sk);
}
if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
tcp_rearm_rto(sk);
}