Thomas Gleixner | 09c434b | 2019-05-19 13:08:20 +0100 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0-only |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 2 | /* |
| 3 | * TCP NV: TCP with Congestion Avoidance |
| 4 | * |
| 5 | * TCP-NV is a successor of TCP-Vegas that has been developed to |
| 6 | * deal with the issues that occur in modern networks. |
| 7 | * Like TCP-Vegas, TCP-NV supports true congestion avoidance, |
| 8 | * the ability to detect congestion before packet losses occur. |
| 9 | * When congestion (queue buildup) starts to occur, TCP-NV |
| 10 | * predicts what the cwnd size should be for the current |
| 11 | * throughput and it reduces the cwnd proportionally to |
| 12 | * the difference between the current cwnd and the predicted cwnd. |
| 13 | * |
| 14 | * NV is only recommended for traffic within a data center, and when |
| 15 | * all the flows are NV (at least those within the data center). This |
| 16 | * is due to the inherent unfairness between flows using losses to |
| 17 | * detect congestion (congestion control) and those that use queue |
| 18 | * buildup to detect congestion (congestion avoidance). |
| 19 | * |
| 20 | * Note: High NIC coalescence values may lower the performance of NV |
| 21 | * due to the increased noise in RTT values. In particular, we have |
| 22 | * seen issues with rx-frames values greater than 8. |
| 23 | * |
| 24 | * TODO: |
| 25 | * 1) Add mechanism to deal with reverse congestion. |
| 26 | */ |
| 27 | |
| 28 | #include <linux/mm.h> |
| 29 | #include <linux/module.h> |
| 30 | #include <linux/math64.h> |
| 31 | #include <net/tcp.h> |
| 32 | #include <linux/inet_diag.h> |
| 33 | |
| 34 | /* TCP NV parameters |
| 35 | * |
| 36 | * nv_pad Max number of queued packets allowed in network |
| 37 | * nv_pad_buffer Do not grow cwnd if this close to nv_pad |
| 38 | * nv_reset_period How often (in seconds) to reset min_rtt |
| 39 | * nv_min_cwnd Don't decrease cwnd below this if there are no losses |
| 40 | * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected |
| 41 | * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 |
| 42 | * nv_rtt_factor RTT averaging factor |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 43 | * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 44 | * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd |
| 45 | * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd |
| 46 | * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping |
| 47 | * slow-start due to congestion |
| 48 | * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion |
| 49 | * nv_rtt_min_cnt Wait these many RTTs before making congestion decision |
| 50 | * nv_cwnd_growth_rate_neg |
| 51 | * nv_cwnd_growth_rate_pos |
| 52 | * How quickly to double growth rate (not rate) of cwnd when not |
| 53 | * congested. One value (nv_cwnd_growth_rate_neg) for when |
| 54 | * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) |
| 55 | * otherwise. |
| 56 | */ |
| 57 | |
| 58 | static int nv_pad __read_mostly = 10; |
| 59 | static int nv_pad_buffer __read_mostly = 2; |
| 60 | static int nv_reset_period __read_mostly = 5; /* in seconds */ |
| 61 | static int nv_min_cwnd __read_mostly = 2; |
| 62 | static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ |
| 63 | static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ |
| 64 | static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 65 | static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 66 | static int nv_cwnd_growth_rate_neg __read_mostly = 8; |
| 67 | static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ |
| 68 | static int nv_dec_eval_min_calls __read_mostly = 60; |
| 69 | static int nv_inc_eval_min_calls __read_mostly = 20; |
| 70 | static int nv_ssthresh_eval_min_calls __read_mostly = 30; |
| 71 | static int nv_stop_rtt_cnt __read_mostly = 10; |
| 72 | static int nv_rtt_min_cnt __read_mostly = 2; |
| 73 | |
| 74 | module_param(nv_pad, int, 0644); |
| 75 | MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); |
| 76 | module_param(nv_reset_period, int, 0644); |
| 77 | MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); |
| 78 | module_param(nv_min_cwnd, int, 0644); |
| 79 | MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" |
| 80 | " without losses"); |
| 81 | |
| 82 | /* TCP NV Parameters */ |
| 83 | struct tcpnv { |
| 84 | unsigned long nv_min_rtt_reset_jiffies; /* when to switch to |
| 85 | * nv_min_rtt_new */ |
| 86 | s8 cwnd_growth_factor; /* Current cwnd growth factor, |
| 87 | * < 0 => less than 1 packet/RTT */ |
| 88 | u8 available8; |
| 89 | u16 available16; |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 90 | u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ |
| 91 | nv_reset:1, /* whether to reset values */ |
| 92 | nv_catchup:1; /* whether we are growing because |
| 93 | * of temporary cwnd decrease */ |
| 94 | u8 nv_eval_call_cnt; /* call count since last eval */ |
| 95 | u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is |
| 96 | * smaller than this. It may grow to handle |
| 97 | * TSO, LRO and interrupt coalescence because |
| 98 | * with these a small cwnd cannot saturate |
| 99 | * the link. Note that this is different from |
| 100 | * the file local nv_min_cwnd */ |
| 101 | u8 nv_rtt_cnt; /* RTTs without making ca decision */; |
| 102 | u32 nv_last_rtt; /* last rtt */ |
| 103 | u32 nv_min_rtt; /* active min rtt. Used to determine slope */ |
| 104 | u32 nv_min_rtt_new; /* min rtt for future use */ |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 105 | u32 nv_base_rtt; /* If non-zero it represents the threshold for |
| 106 | * congestion */ |
| 107 | u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is |
| 108 | * set to 80% of nv_base_rtt. It helps reduce |
| 109 | * unfairness between flows */ |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 110 | u32 nv_rtt_max_rate; /* max rate seen during current RTT */ |
| 111 | u32 nv_rtt_start_seq; /* current RTT ends when packet arrives |
| 112 | * acking beyond nv_rtt_start_seq */ |
| 113 | u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is |
| 114 | * used to determine bytes acked since last |
| 115 | * call to bictcp_acked */ |
| 116 | u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ |
| 117 | }; |
| 118 | |
/* Initial value of nv_min_rtt / nv_min_rtt_new; any real sample is <= it */
#define NV_INIT_RTT		U32_MAX
/* Initial value of, and floor for, the per-connection ca->nv_min_cwnd */
#define NV_MIN_CWND		4
/* Increment applied when ca->nv_min_cwnd is raised */
#define NV_MIN_CWND_GROW	2
/* ca->nv_min_cwnd is capped at NV_TSO_CWND_BOUND + 1 */
#define NV_TSO_CWND_BOUND	80
| 123 | |
| 124 | static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) |
| 125 | { |
| 126 | struct tcp_sock *tp = tcp_sk(sk); |
| 127 | |
| 128 | ca->nv_reset = 0; |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 129 | ca->nv_no_cong_cnt = 0; |
| 130 | ca->nv_rtt_cnt = 0; |
| 131 | ca->nv_last_rtt = 0; |
| 132 | ca->nv_rtt_max_rate = 0; |
| 133 | ca->nv_rtt_start_seq = tp->snd_una; |
| 134 | ca->nv_eval_call_cnt = 0; |
| 135 | ca->nv_last_snd_una = tp->snd_una; |
| 136 | } |
| 137 | |
| 138 | static void tcpnv_init(struct sock *sk) |
| 139 | { |
| 140 | struct tcpnv *ca = inet_csk_ca(sk); |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 141 | int base_rtt; |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 142 | |
| 143 | tcpnv_reset(ca, sk); |
| 144 | |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 145 | /* See if base_rtt is available from socket_ops bpf program. |
| 146 | * It is meant to be used in environments, such as communication |
| 147 | * within a datacenter, where we have reasonable estimates of |
| 148 | * RTTs |
| 149 | */ |
Lawrence Brakmo | de525be | 2018-01-25 16:14:09 -0800 | [diff] [blame] | 150 | base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 151 | if (base_rtt > 0) { |
| 152 | ca->nv_base_rtt = base_rtt; |
| 153 | ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ |
| 154 | } else { |
| 155 | ca->nv_base_rtt = 0; |
| 156 | ca->nv_lower_bound_rtt = 0; |
| 157 | } |
| 158 | |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 159 | ca->nv_allow_cwnd_growth = 1; |
| 160 | ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; |
| 161 | ca->nv_min_rtt = NV_INIT_RTT; |
| 162 | ca->nv_min_rtt_new = NV_INIT_RTT; |
| 163 | ca->nv_min_cwnd = NV_MIN_CWND; |
| 164 | ca->nv_catchup = 0; |
| 165 | ca->cwnd_growth_factor = 0; |
| 166 | } |
| 167 | |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 168 | /* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) |
| 169 | * bounds to RTT. |
| 170 | */ |
| 171 | inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) |
| 172 | { |
| 173 | if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) |
| 174 | return ca->nv_lower_bound_rtt; |
| 175 | else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) |
| 176 | return ca->nv_base_rtt; |
| 177 | else |
| 178 | return val; |
| 179 | } |
| 180 | |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 181 | static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
| 182 | { |
| 183 | struct tcp_sock *tp = tcp_sk(sk); |
| 184 | struct tcpnv *ca = inet_csk_ca(sk); |
| 185 | u32 cnt; |
| 186 | |
| 187 | if (!tcp_is_cwnd_limited(sk)) |
| 188 | return; |
| 189 | |
| 190 | /* Only grow cwnd if NV has not detected congestion */ |
| 191 | if (!ca->nv_allow_cwnd_growth) |
| 192 | return; |
| 193 | |
| 194 | if (tcp_in_slow_start(tp)) { |
| 195 | acked = tcp_slow_start(tp, acked); |
| 196 | if (!acked) |
| 197 | return; |
| 198 | } |
| 199 | |
| 200 | if (ca->cwnd_growth_factor < 0) { |
| 201 | cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; |
| 202 | tcp_cong_avoid_ai(tp, cnt, acked); |
| 203 | } else { |
| 204 | cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); |
| 205 | tcp_cong_avoid_ai(tp, cnt, acked); |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | static u32 tcpnv_recalc_ssthresh(struct sock *sk) |
| 210 | { |
| 211 | const struct tcp_sock *tp = tcp_sk(sk); |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 212 | |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 213 | return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); |
| 214 | } |
| 215 | |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 216 | static void tcpnv_state(struct sock *sk, u8 new_state) |
| 217 | { |
| 218 | struct tcpnv *ca = inet_csk_ca(sk); |
| 219 | |
| 220 | if (new_state == TCP_CA_Open && ca->nv_reset) { |
| 221 | tcpnv_reset(ca, sk); |
| 222 | } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || |
| 223 | new_state == TCP_CA_Recovery) { |
| 224 | ca->nv_reset = 1; |
| 225 | ca->nv_allow_cwnd_growth = 0; |
| 226 | if (new_state == TCP_CA_Loss) { |
| 227 | /* Reset cwnd growth factor to Reno value */ |
| 228 | if (ca->cwnd_growth_factor > 0) |
| 229 | ca->cwnd_growth_factor = 0; |
| 230 | /* Decrease growth rate if allowed */ |
| 231 | if (nv_cwnd_growth_rate_neg > 0 && |
| 232 | ca->cwnd_growth_factor > -8) |
| 233 | ca->cwnd_growth_factor--; |
| 234 | } |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | /* Do congestion avoidance calculations for TCP-NV |
| 239 | */ |
| 240 | static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) |
| 241 | { |
| 242 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 243 | struct tcp_sock *tp = tcp_sk(sk); |
| 244 | struct tcpnv *ca = inet_csk_ca(sk); |
| 245 | unsigned long now = jiffies; |
Konstantin Khlebnikov | 991a26a | 2017-11-02 17:07:05 +0300 | [diff] [blame] | 246 | u64 rate64; |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 247 | u32 rate, max_win, cwnd_by_slope; |
| 248 | u32 avg_rtt; |
| 249 | u32 bytes_acked = 0; |
| 250 | |
| 251 | /* Some calls are for duplicates without timetamps */ |
| 252 | if (sample->rtt_us < 0) |
| 253 | return; |
| 254 | |
| 255 | /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ |
| 256 | if (icsk->icsk_ca_state != TCP_CA_Open && |
| 257 | icsk->icsk_ca_state != TCP_CA_Disorder) |
| 258 | return; |
| 259 | |
| 260 | /* Stop cwnd growth if we were in catch up mode */ |
| 261 | if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { |
| 262 | ca->nv_catchup = 0; |
| 263 | ca->nv_allow_cwnd_growth = 0; |
| 264 | } |
| 265 | |
| 266 | bytes_acked = tp->snd_una - ca->nv_last_snd_una; |
| 267 | ca->nv_last_snd_una = tp->snd_una; |
| 268 | |
| 269 | if (sample->in_flight == 0) |
| 270 | return; |
| 271 | |
| 272 | /* Calculate moving average of RTT */ |
| 273 | if (nv_rtt_factor > 0) { |
| 274 | if (ca->nv_last_rtt > 0) { |
| 275 | avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + |
| 276 | ((u64)ca->nv_last_rtt) |
| 277 | * (256 - nv_rtt_factor)) >> 8; |
| 278 | } else { |
| 279 | avg_rtt = sample->rtt_us; |
| 280 | ca->nv_min_rtt = avg_rtt << 1; |
| 281 | } |
| 282 | ca->nv_last_rtt = avg_rtt; |
| 283 | } else { |
| 284 | avg_rtt = sample->rtt_us; |
| 285 | } |
| 286 | |
| 287 | /* rate in 100's bits per second */ |
Konstantin Khlebnikov | 991a26a | 2017-11-02 17:07:05 +0300 | [diff] [blame] | 288 | rate64 = ((u64)sample->in_flight) * 80000; |
| 289 | do_div(rate64, avg_rtt ?: 1); |
| 290 | rate = (u32)rate64; |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 291 | |
| 292 | /* Remember the maximum rate seen during this RTT |
| 293 | * Note: It may be more than one RTT. This function should be |
| 294 | * called at least nv_dec_eval_min_calls times. |
| 295 | */ |
| 296 | if (ca->nv_rtt_max_rate < rate) |
| 297 | ca->nv_rtt_max_rate = rate; |
| 298 | |
| 299 | /* We have valid information, increment counter */ |
| 300 | if (ca->nv_eval_call_cnt < 255) |
| 301 | ca->nv_eval_call_cnt++; |
| 302 | |
Lawrence Brakmo | 85cce21 | 2017-10-20 11:05:41 -0700 | [diff] [blame] | 303 | /* Apply bounds to rtt. Only used to update min_rtt */ |
| 304 | avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); |
| 305 | |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 306 | /* update min rtt if necessary */ |
| 307 | if (avg_rtt < ca->nv_min_rtt) |
| 308 | ca->nv_min_rtt = avg_rtt; |
| 309 | |
| 310 | /* update future min_rtt if necessary */ |
| 311 | if (avg_rtt < ca->nv_min_rtt_new) |
| 312 | ca->nv_min_rtt_new = avg_rtt; |
| 313 | |
| 314 | /* nv_min_rtt is updated with the minimum (possibley averaged) rtt |
| 315 | * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a |
| 316 | * warm reset). This new nv_min_rtt will be continued to be updated |
| 317 | * and be used for another sysctl_tcp_nv_reset_period seconds, |
| 318 | * when it will be updated again. |
| 319 | * In practice we introduce some randomness, so the actual period used |
| 320 | * is chosen randomly from the range: |
| 321 | * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) |
| 322 | */ |
| 323 | if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { |
| 324 | unsigned char rand; |
| 325 | |
| 326 | ca->nv_min_rtt = ca->nv_min_rtt_new; |
| 327 | ca->nv_min_rtt_new = NV_INIT_RTT; |
| 328 | get_random_bytes(&rand, 1); |
| 329 | ca->nv_min_rtt_reset_jiffies = |
| 330 | now + ((nv_reset_period * (384 + rand) * HZ) >> 9); |
| 331 | /* Every so often we decrease ca->nv_min_cwnd in case previous |
| 332 | * value is no longer accurate. |
| 333 | */ |
| 334 | ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); |
| 335 | } |
| 336 | |
| 337 | /* Once per RTT check if we need to do congestion avoidance */ |
| 338 | if (before(ca->nv_rtt_start_seq, tp->snd_una)) { |
| 339 | ca->nv_rtt_start_seq = tp->snd_nxt; |
| 340 | if (ca->nv_rtt_cnt < 0xff) |
| 341 | /* Increase counter for RTTs without CA decision */ |
| 342 | ca->nv_rtt_cnt++; |
| 343 | |
| 344 | /* If this function is only called once within an RTT |
| 345 | * the cwnd is probably too small (in some cases due to |
| 346 | * tso, lro or interrupt coalescence), so we increase |
| 347 | * ca->nv_min_cwnd. |
| 348 | */ |
| 349 | if (ca->nv_eval_call_cnt == 1 && |
| 350 | bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && |
| 351 | ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { |
| 352 | ca->nv_min_cwnd = min(ca->nv_min_cwnd |
| 353 | + NV_MIN_CWND_GROW, |
| 354 | NV_TSO_CWND_BOUND + 1); |
| 355 | ca->nv_rtt_start_seq = tp->snd_nxt + |
| 356 | ca->nv_min_cwnd * tp->mss_cache; |
| 357 | ca->nv_eval_call_cnt = 0; |
| 358 | ca->nv_allow_cwnd_growth = 1; |
| 359 | return; |
| 360 | } |
| 361 | |
| 362 | /* Find the ideal cwnd for current rate from slope |
| 363 | * slope = 80000.0 * mss / nv_min_rtt |
| 364 | * cwnd_by_slope = nv_rtt_max_rate / slope |
| 365 | */ |
| 366 | cwnd_by_slope = (u32) |
| 367 | div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, |
Gustavo A. R. Silva | e4823fb | 2018-01-30 22:21:48 -0600 | [diff] [blame] | 368 | 80000ULL * tp->mss_cache); |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 369 | max_win = cwnd_by_slope + nv_pad; |
| 370 | |
| 371 | /* If cwnd > max_win, decrease cwnd |
| 372 | * if cwnd < max_win, grow cwnd |
| 373 | * else leave the same |
| 374 | */ |
| 375 | if (tp->snd_cwnd > max_win) { |
| 376 | /* there is congestion, check that it is ok |
| 377 | * to make a CA decision |
| 378 | * 1. We should have at least nv_dec_eval_min_calls |
| 379 | * data points before making a CA decision |
| 380 | * 2. We only make a congesion decision after |
| 381 | * nv_rtt_min_cnt RTTs |
| 382 | */ |
| 383 | if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { |
| 384 | return; |
| 385 | } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { |
| 386 | if (ca->nv_eval_call_cnt < |
| 387 | nv_ssthresh_eval_min_calls) |
| 388 | return; |
| 389 | /* otherwise we will decrease cwnd */ |
| 390 | } else if (ca->nv_eval_call_cnt < |
| 391 | nv_dec_eval_min_calls) { |
| 392 | if (ca->nv_allow_cwnd_growth && |
| 393 | ca->nv_rtt_cnt > nv_stop_rtt_cnt) |
| 394 | ca->nv_allow_cwnd_growth = 0; |
| 395 | return; |
| 396 | } |
| 397 | |
| 398 | /* We have enough data to determine we are congested */ |
| 399 | ca->nv_allow_cwnd_growth = 0; |
| 400 | tp->snd_ssthresh = |
| 401 | (nv_ssthresh_factor * max_win) >> 3; |
| 402 | if (tp->snd_cwnd - max_win > 2) { |
| 403 | /* gap > 2, we do exponential cwnd decrease */ |
| 404 | int dec; |
| 405 | |
| 406 | dec = max(2U, ((tp->snd_cwnd - max_win) * |
| 407 | nv_cong_dec_mult) >> 7); |
| 408 | tp->snd_cwnd -= dec; |
| 409 | } else if (nv_cong_dec_mult > 0) { |
| 410 | tp->snd_cwnd = max_win; |
| 411 | } |
| 412 | if (ca->cwnd_growth_factor > 0) |
| 413 | ca->cwnd_growth_factor = 0; |
| 414 | ca->nv_no_cong_cnt = 0; |
| 415 | } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { |
| 416 | /* There is no congestion, grow cwnd if allowed*/ |
| 417 | if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) |
| 418 | return; |
| 419 | |
| 420 | ca->nv_allow_cwnd_growth = 1; |
| 421 | ca->nv_no_cong_cnt++; |
| 422 | if (ca->cwnd_growth_factor < 0 && |
| 423 | nv_cwnd_growth_rate_neg > 0 && |
| 424 | ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { |
| 425 | ca->cwnd_growth_factor++; |
| 426 | ca->nv_no_cong_cnt = 0; |
| 427 | } else if (ca->cwnd_growth_factor >= 0 && |
| 428 | nv_cwnd_growth_rate_pos > 0 && |
| 429 | ca->nv_no_cong_cnt > |
| 430 | nv_cwnd_growth_rate_pos) { |
| 431 | ca->cwnd_growth_factor++; |
| 432 | ca->nv_no_cong_cnt = 0; |
| 433 | } |
| 434 | } else { |
| 435 | /* cwnd is in-between, so do nothing */ |
| 436 | return; |
| 437 | } |
| 438 | |
| 439 | /* update state */ |
| 440 | ca->nv_eval_call_cnt = 0; |
| 441 | ca->nv_rtt_cnt = 0; |
| 442 | ca->nv_rtt_max_rate = 0; |
| 443 | |
| 444 | /* Don't want to make cwnd < nv_min_cwnd |
| 445 | * (it wasn't before, if it is now is because nv |
| 446 | * decreased it). |
| 447 | */ |
| 448 | if (tp->snd_cwnd < nv_min_cwnd) |
| 449 | tp->snd_cwnd = nv_min_cwnd; |
| 450 | } |
| 451 | } |
| 452 | |
| 453 | /* Extract info for Tcp socket info provided via netlink */ |
stephen hemminger | c718c6d | 2017-05-19 09:55:52 -0700 | [diff] [blame] | 454 | static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, |
| 455 | union tcp_cc_info *info) |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 456 | { |
| 457 | const struct tcpnv *ca = inet_csk_ca(sk); |
| 458 | |
| 459 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
| 460 | info->vegas.tcpv_enabled = 1; |
| 461 | info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; |
| 462 | info->vegas.tcpv_rtt = ca->nv_last_rtt; |
| 463 | info->vegas.tcpv_minrtt = ca->nv_min_rtt; |
| 464 | |
| 465 | *attr = INET_DIAG_VEGASINFO; |
| 466 | return sizeof(struct tcpvegas_info); |
| 467 | } |
| 468 | return 0; |
| 469 | } |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 470 | |
| 471 | static struct tcp_congestion_ops tcpnv __read_mostly = { |
| 472 | .init = tcpnv_init, |
| 473 | .ssthresh = tcpnv_recalc_ssthresh, |
| 474 | .cong_avoid = tcpnv_cong_avoid, |
| 475 | .set_state = tcpnv_state, |
Yuchung Cheng | f1722a1 | 2017-08-03 20:38:52 -0700 | [diff] [blame] | 476 | .undo_cwnd = tcp_reno_undo_cwnd, |
Lawrence Brakmo | 699fafa | 2016-06-08 21:16:45 -0700 | [diff] [blame] | 477 | .pkts_acked = tcpnv_acked, |
| 478 | .get_info = tcpnv_get_info, |
| 479 | |
| 480 | .owner = THIS_MODULE, |
| 481 | .name = "nv", |
| 482 | }; |
| 483 | |
| 484 | static int __init tcpnv_register(void) |
| 485 | { |
| 486 | BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); |
| 487 | |
| 488 | return tcp_register_congestion_control(&tcpnv); |
| 489 | } |
| 490 | |
| 491 | static void __exit tcpnv_unregister(void) |
| 492 | { |
| 493 | tcp_unregister_congestion_control(&tcpnv); |
| 494 | } |
| 495 | |
| 496 | module_init(tcpnv_register); |
| 497 | module_exit(tcpnv_unregister); |
| 498 | |
| 499 | MODULE_AUTHOR("Lawrence Brakmo"); |
| 500 | MODULE_LICENSE("GPL"); |
| 501 | MODULE_DESCRIPTION("TCP NV"); |
| 502 | MODULE_VERSION("1.0"); |