
Commit 43e122b

edumazet authored and davem330 committed
tcp: refine pacing rate determination
When TCP pacing was added back in linux-3.12, we chose to apply a fixed
ratio of 200 % against current rate, to allow probing for optimal
throughput even during slow start phase, where cwnd can be doubled
every other RTT.

At Google, we found it was better applying a different ratio while in
Congestion Avoidance phase. This ratio was set to 120 %.

We've used the normal tcp_in_slow_start() helper for a while, then
tuned the condition to select the conservative ratio as soon as
cwnd >= ssthresh/2 :

- After cwnd reduction, it is safer to ramp up more slowly, as we
  approach optimal cwnd.
- Initial ramp up (ssthresh == INFINITY) still allows doubling cwnd
  every other RTT.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
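As a rough illustration of the two ratios (example numbers, not from the
commit): with mss = 1448 bytes, cwnd = 100 and srtt = 100 ms, the current
rate is 100 * 1448 / 0.1 s ≈ 1.45 MB/s. While cwnd < ssthresh/2 the socket
is paced at 200 % of that, about 2.9 MB/s; once cwnd >= ssthresh/2 the
conservative 120 % ratio applies, about 1.74 MB/s.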
1 parent 4ec3b28 commit 43e122b

4 files changed: +53 −1 lines changed

Documentation/networking/ip-sysctl.txt (15 additions, 0 deletions)

@@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER
 	if available window is too small.
 	Default: 2
 
+tcp_pacing_ss_ratio - INTEGER
+	sk->sk_pacing_rate is set by TCP stack using a ratio applied
+	to current rate. (current_rate = cwnd * mss / srtt)
+	If TCP is in slow start, tcp_pacing_ss_ratio is applied
+	to let TCP probe for bigger speeds, assuming cwnd can be
+	doubled every other RTT.
+	Default: 200
+
+tcp_pacing_ca_ratio - INTEGER
+	sk->sk_pacing_rate is set by TCP stack using a ratio applied
+	to current rate. (current_rate = cwnd * mss / srtt)
+	If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio
+	is applied to conservatively probe for bigger throughput.
+	Default: 120
+
 tcp_tso_win_divisor - INTEGER
 	This allows control over what percentage of the congestion window
 	can be consumed by a single TSO frame.
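
Once the commit is applied, both ratios are runtime-tunable through procfs.
A minimal userspace sketch (illustrative, not part of the commit) that reads
the current values:

#include <stdio.h>

/* Print the two pacing-ratio sysctls added by this commit.
 * net.ipv4.* tunables live under /proc/sys/net/ipv4/.
 */
static int print_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		if (f)
			fclose(f);
		return -1;
	}
	fclose(f);
	printf("%s = %d\n", path, val);
	return 0;
}

int main(void)
{
	print_sysctl("/proc/sys/net/ipv4/tcp_pacing_ss_ratio");
	print_sysctl("/proc/sys/net/ipv4/tcp_pacing_ca_ratio");
	return 0;
}

Writing works the same way: open the same files for writing (as root) and
store a new percentage.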

include/net/tcp.h (2 additions, 0 deletions)

@@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_pacing_ss_ratio;
+extern int sysctl_tcp_pacing_ca_ratio;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;

net/ipv4/sysctl_net_ipv4.c (19 additions, 0 deletions)

@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int thousand = 1000;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
@@ -711,6 +712,24 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &one,
 		.extra2		= &gso_max_segs,
 	},
+	{
+		.procname	= "tcp_pacing_ss_ratio",
+		.data		= &sysctl_tcp_pacing_ss_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &thousand,
+	},
+	{
+		.procname	= "tcp_pacing_ca_ratio",
+		.data		= &sysctl_tcp_pacing_ca_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &thousand,
+	},
 	{
 		.procname	= "tcp_autocorking",
 		.data		= &sysctl_tcp_autocorking,
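
Both new entries use proc_dointvec_minmax with extra1 = &zero and
extra2 = &thousand, so the accepted range is 0..1000 percent; a write
outside that range (say 2000) should be rejected by the handler with
EINVAL rather than silently clamped.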

net/ipv4/tcp_input.c (17 additions, 1 deletion)

@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
  * TCP pacing, to smooth the burst on large writes when packets
  * in flight is significantly lower than cwnd (or rwin)
  */
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
 static void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
 
 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+		rate *= sysctl_tcp_pacing_ss_ratio;
+	else
+		rate *= sysctl_tcp_pacing_ca_ratio;
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 