1 /*        $NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $        */
2 
3 /*-
4  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
9  * Facility, NASA Ames Research Center.
10  * This code is derived from software contributed to The NetBSD Foundation
11  * by Charles M. Hannum.
12  * This code is derived from software contributed to The NetBSD Foundation
13  * by Rui Paulo.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
28  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 /*
38  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
39  * All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the project nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  */
65 
66 /*
67  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
68  *
69  * NRL grants permission for redistribution and use in source and binary
70  * forms, with or without modification, of the software and documentation
71  * created at NRL provided that the following conditions are met:
72  *
73  * 1. Redistributions of source code must retain the above copyright
74  *    notice, this list of conditions and the following disclaimer.
75  * 2. Redistributions in binary form must reproduce the above copyright
76  *    notice, this list of conditions and the following disclaimer in the
77  *    documentation and/or other materials provided with the distribution.
78  * 3. All advertising materials mentioning features or use of this software
79  *    must display the following acknowledgements:
80  *      This product includes software developed by the University of
81  *      California, Berkeley and its contributors.
82  *      This product includes software developed at the Information
83  *      Technology Division, US Naval Research Laboratory.
84  * 4. Neither the name of the NRL nor the names of its contributors
85  *    may be used to endorse or promote products derived from this software
86  *    without specific prior written permission.
87  *
88  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
89  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
90  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
91  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
92  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
93  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
94  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
95  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
96  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
97  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
98  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
99  *
100  * The views and conclusions contained in the software and documentation
101  * are those of the authors and should not be interpreted as representing
102  * official policies, either expressed or implied, of the US Naval
103  * Research Laboratory (NRL).
104  */
105 
106 /*
107  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
108  *        The Regents of the University of California.  All rights reserved.
109  *
110  * Redistribution and use in source and binary forms, with or without
111  * modification, are permitted provided that the following conditions
112  * are met:
113  * 1. Redistributions of source code must retain the above copyright
114  *    notice, this list of conditions and the following disclaimer.
115  * 2. Redistributions in binary form must reproduce the above copyright
116  *    notice, this list of conditions and the following disclaimer in the
117  *    documentation and/or other materials provided with the distribution.
118  * 3. Neither the name of the University nor the names of its contributors
119  *    may be used to endorse or promote products derived from this software
120  *    without specific prior written permission.
121  *
122  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132  * SUCH DAMAGE.
133  *
134  *        @(#)tcp_input.c     8.12 (Berkeley) 5/24/95
135  */
136 
137 #include <sys/cdefs.h>
138 __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $");
139 
140 #ifdef _KERNEL_OPT
141 #include "opt_inet.h"
142 #include "opt_tcp_debug.h"
143 #include "opt_tcp_congctl.h"
144 #endif
145 
146 #include <sys/param.h>
147 #include <sys/systm.h>
148 #include <sys/malloc.h>
149 #include <sys/mbuf.h>
150 #include <sys/protosw.h>
151 #include <sys/socket.h>
152 #include <sys/socketvar.h>
153 #include <sys/errno.h>
154 #include <sys/syslog.h>
155 #include <sys/pool.h>
156 #include <sys/domain.h>
157 #include <sys/kernel.h>
158 #include <sys/mutex.h>
159 
160 #include <net/if.h>
161 
162 #include <netinet/in.h>
163 #include <netinet/in_systm.h>
164 #include <netinet/ip.h>
165 #include <netinet/in_pcb.h>
166 #include <netinet/in_var.h>
167 #include <netinet/ip_var.h>
168 
169 #ifdef INET6
170 #include <netinet/ip6.h>
171 #include <netinet6/ip6_var.h>
172 #include <netinet6/in6_pcb.h>
173 #include <netinet6/ip6_var.h>
174 #include <netinet6/in6_var.h>
175 #include <netinet/icmp6.h>
176 #endif
177 
178 #include <netinet/tcp.h>
179 #include <netinet/tcp_fsm.h>
180 #include <netinet/tcp_seq.h>
181 #include <netinet/tcp_timer.h>
182 #include <netinet/tcp_var.h>
183 #include <netinet/tcp_congctl.h>
184 #ifdef TCP_DEBUG
185 #include <netinet/tcp_debug.h>
186 #endif
187 
188 /*
189  * TODO:
190  *   consider separating the actual implementations in another file.
191  */
192 
193 static void tcp_common_congestion_exp(struct tcpcb *, int, int);
194 
195 static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
196 static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
197 static void tcp_reno_slow_retransmit(struct tcpcb *);
198 static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
199     const struct tcphdr *);
200 static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
201 static void tcp_reno_congestion_exp(struct tcpcb *tp);
202 
203 static int  tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
204 static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
205           const struct tcphdr *);
206 static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
207 
208 static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
209 static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
210 static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
211 static void tcp_cubic_congestion_exp(struct tcpcb *);
212 
213 static void tcp_congctl_fillnames(void);
214 
215 extern int tcprexmtthresh;
216 
217 MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
218 
219 /* currently selected global congestion control */
220 char tcp_congctl_global_name[TCPCC_MAXLEN];
221 
222 /* available global congestion control algorithms */
223 char tcp_congctl_avail[10 * TCPCC_MAXLEN];
224 
225 /*
226  * Used to list the available congestion control algorithms.
227  */
228 TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
229     TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
230 
231 static struct tcp_congctlent * tcp_congctl_global;
232 
233 static kmutex_t tcp_congctl_mtx;
234 
235 void
tcp_congctl_init(void)236 tcp_congctl_init(void)
237 {
238           int r __diagused;
239 
240           mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
241 
242           /* Base algorithms. */
243           r = tcp_congctl_register("reno", &tcp_reno_ctl);
244           KASSERT(r == 0);
245           r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
246           KASSERT(r == 0);
247           r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
248           KASSERT(r == 0);
249 
250           /* NewReno is the default. */
251 #ifndef TCP_CONGCTL_DEFAULT
252 #define TCP_CONGCTL_DEFAULT "newreno"
253 #endif
254 
255           r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
256           KASSERT(r == 0);
257 }
258 
259 /*
260  * Register a congestion algorithm and select it if we have none.
261  */
262 int
tcp_congctl_register(const char * name,const struct tcp_congctl * tcc)263 tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
264 {
265           struct tcp_congctlent *ntcc, *tccp;
266 
267           TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
268                     if (!strcmp(name, tccp->congctl_name)) {
269                               /* name already registered */
270                               return EEXIST;
271                     }
272 
273           ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
274 
275           strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
276           ntcc->congctl_ctl = tcc;
277 
278           TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
279           tcp_congctl_fillnames();
280 
281           if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
282                     tcp_congctl_select(NULL, name);
283 
284           return 0;
285 }
286 
287 int
tcp_congctl_unregister(const char * name)288 tcp_congctl_unregister(const char *name)
289 {
290           struct tcp_congctlent *tccp, *rtccp;
291           unsigned int size;
292 
293           rtccp = NULL;
294           size = 0;
295           TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
296                     if (!strcmp(name, tccp->congctl_name))
297                               rtccp = tccp;
298                     size++;
299           }
300 
301           if (!rtccp)
302                     return ENOENT;
303 
304           if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
305                     return EBUSY;
306 
307           TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
308           free(rtccp, M_TCPCONGCTL);
309           tcp_congctl_fillnames();
310 
311           return 0;
312 }
313 
314 /*
315  * Select a congestion algorithm by name.
316  */
317 int
tcp_congctl_select(struct tcpcb * tp,const char * name)318 tcp_congctl_select(struct tcpcb *tp, const char *name)
319 {
320           struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
321           bool old_found, new_found;
322 
323           KASSERT(name);
324 
325           old_found = (tp == NULL || tp->t_congctl == NULL);
326           old_tccp = NULL;
327           new_found = false;
328           new_tccp = NULL;
329 
330           TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
331                     if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
332                               old_tccp = tccp;
333                               old_found = true;
334                     }
335 
336                     if (!new_found && !strcmp(name, tccp->congctl_name)) {
337                               new_tccp = tccp;
338                               new_found = true;
339                     }
340 
341                     if (new_found && old_found) {
342                               if (tp) {
343                                         mutex_enter(&tcp_congctl_mtx);
344                                         if (old_tccp)
345                                                   old_tccp->congctl_refcnt--;
346                                         tp->t_congctl = new_tccp->congctl_ctl;
347                                         new_tccp->congctl_refcnt++;
348                                         mutex_exit(&tcp_congctl_mtx);
349                               } else {
350                                         tcp_congctl_global = new_tccp;
351                                         strlcpy(tcp_congctl_global_name,
352                                             new_tccp->congctl_name,
353                                             sizeof(tcp_congctl_global_name) - 1);
354                               }
355                               return 0;
356                     }
357           }
358 
359           return EINVAL;
360 }
361 
362 void
tcp_congctl_release(struct tcpcb * tp)363 tcp_congctl_release(struct tcpcb *tp)
364 {
365           struct tcp_congctlent *tccp;
366 
367           KASSERT(tp->t_congctl);
368 
369           TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
370                     if (tccp->congctl_ctl == tp->t_congctl) {
371                               tccp->congctl_refcnt--;
372                               return;
373                     }
374           }
375 }
376 
377 /*
378  * Returns the name of a congestion algorithm.
379  */
380 const char *
tcp_congctl_bystruct(const struct tcp_congctl * tcc)381 tcp_congctl_bystruct(const struct tcp_congctl *tcc)
382 {
383           struct tcp_congctlent *tccp;
384 
385           KASSERT(tcc);
386 
387           TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
388                     if (tccp->congctl_ctl == tcc)
389                               return tccp->congctl_name;
390 
391           return NULL;
392 }
393 
394 static void
tcp_congctl_fillnames(void)395 tcp_congctl_fillnames(void)
396 {
397           struct tcp_congctlent *tccp;
398           const char *delim = " ";
399 
400           tcp_congctl_avail[0] = '\0';
401           TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
402                     strlcat(tcp_congctl_avail, tccp->congctl_name,
403                         sizeof(tcp_congctl_avail) - 1);
404                     if (TAILQ_NEXT(tccp, congctl_ent))
405                               strlcat(tcp_congctl_avail, delim,
406                                   sizeof(tcp_congctl_avail) - 1);
407           }
408 
409 }
410 
411 /* ------------------------------------------------------------------------ */
412 
413 /*
414  * Common stuff
415  */
416 
417 /* Window reduction (1-beta) for [New]Reno: 0.5 */
418 #define RENO_BETAA 1
419 #define RENO_BETAB 2
420 /* Window reduction (1-beta) for Cubic: 0.8 */
421 #define CUBIC_BETAA 4
422 #define CUBIC_BETAB 5
423 /* Draft Rhee Section 4.1 */
424 #define CUBIC_CA 4
425 #define CUBIC_CB 10
426 
427 static void
tcp_common_congestion_exp(struct tcpcb * tp,int betaa,int betab)428 tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
429 {
430           u_long win;
431 
432           /*
433            * Reduce the congestion window and the slow start threshold.
434            */
435           win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
436           if (win < 2)
437                     win = 2;
438 
439           tp->snd_ssthresh = win * tp->t_segsz;
440           tp->snd_recover = tp->snd_max;
441           tp->snd_cwnd = tp->snd_ssthresh;
442 
443           /*
444            * When using TCP ECN, notify the peer that
445            * we reduced the cwnd.
446            */
447           if (TCP_ECN_ALLOWED(tp))
448                     tp->t_flags |= TF_ECN_SND_CWR;
449 }
450 
451 
452 /* ------------------------------------------------------------------------ */
453 
454 /*
455  * TCP/Reno congestion control.
456  */
457 static void
tcp_reno_congestion_exp(struct tcpcb * tp)458 tcp_reno_congestion_exp(struct tcpcb *tp)
459 {
460 
461           tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
462 }
463 
464 static int
tcp_reno_do_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)465 tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
466 {
467           /*
468            * Dup acks mean that packets have left the
469            * network (they're now cached at the receiver)
470            * so bump cwnd by the amount in the receiver
471            * to keep a constant cwnd packets in the
472            * network.
473            *
474            * If we are using TCP/SACK, then enter
475            * Fast Recovery if the receiver SACKs
476            * data that is tcprexmtthresh * MSS
477            * bytes past the last ACKed segment,
478            * irrespective of the number of DupAcks.
479            */
480 
481           tcp_seq onxt = tp->snd_nxt;
482 
483           tp->t_partialacks = 0;
484           TCP_TIMER_DISARM(tp, TCPT_REXMT);
485           tp->t_rtttime = 0;
486           if (TCP_SACK_ENABLED(tp)) {
487                     tp->t_dupacks = tcprexmtthresh;
488                     tp->sack_newdata = tp->snd_nxt;
489                     tp->snd_cwnd = tp->t_segsz;
490                     (void) tcp_output(tp);
491                     return 0;
492           }
493           tp->snd_nxt = th->th_ack;
494           tp->snd_cwnd = tp->t_segsz;
495           (void) tcp_output(tp);
496           tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
497           if (SEQ_GT(onxt, tp->snd_nxt))
498                     tp->snd_nxt = onxt;
499 
500           return 0;
501 }
502 
503 static int
tcp_reno_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)504 tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
505 {
506 
507           /*
508            * We know we're losing at the current
509            * window size so do congestion avoidance
510            * (set ssthresh to half the current window
511            * and pull our congestion window back to
512            * the new ssthresh).
513            */
514 
515           tcp_reno_congestion_exp(tp);
516           return tcp_reno_do_fast_retransmit(tp, th);
517 }
518 
519 static void
tcp_reno_slow_retransmit(struct tcpcb * tp)520 tcp_reno_slow_retransmit(struct tcpcb *tp)
521 {
522           u_long win;
523 
524           /*
525            * Close the congestion window down to one segment
526            * (we'll open it by one segment for each ack we get).
527            * Since we probably have a window's worth of unacked
528            * data accumulated, this "slow start" keeps us from
529            * dumping all that data as back-to-back packets (which
530            * might overwhelm an intermediate gateway).
531            *
532            * There are two phases to the opening: Initially we
533            * open by one mss on each ack.  This makes the window
534            * size increase exponentially with time.  If the
535            * window is larger than the path can handle, this
536            * exponential growth results in dropped packet(s)
537            * almost immediately.  To get more time between
538            * drops but still "push" the network to take advantage
539            * of improving conditions, we switch from exponential
540            * to linear window opening at some threshold size.
541            * For a threshold, we use half the current window
542            * size, truncated to a multiple of the mss.
543            *
544            * (the minimum cwnd that will give us exponential
545            * growth is 2 mss.  We don't allow the threshold
546            * to go below this.)
547            */
548 
549           win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
550           if (win < 2)
551                     win = 2;
552           /* Loss Window MUST be one segment. */
553           tp->snd_cwnd = tp->t_segsz;
554           tp->snd_ssthresh = win * tp->t_segsz;
555           tp->t_partialacks = -1;
556           tp->t_dupacks = 0;
557           tp->t_bytes_acked = 0;
558 
559           if (TCP_ECN_ALLOWED(tp))
560                     tp->t_flags |= TF_ECN_SND_CWR;
561 }
562 
563 static void
tcp_reno_fast_retransmit_newack(struct tcpcb * tp,const struct tcphdr * th)564 tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
565     const struct tcphdr *th)
566 {
567           if (tp->t_partialacks < 0) {
568                     /*
569                      * We were not in fast recovery.  Reset the duplicate ack
570                      * counter.
571                      */
572                     tp->t_dupacks = 0;
573           } else {
574                     /*
575                      * Clamp the congestion window to the crossover point and
576                      * exit fast recovery.
577                      */
578                     if (tp->snd_cwnd > tp->snd_ssthresh)
579                               tp->snd_cwnd = tp->snd_ssthresh;
580                     tp->t_partialacks = -1;
581                     tp->t_dupacks = 0;
582                     tp->t_bytes_acked = 0;
583                     if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
584                               tp->snd_fack = th->th_ack;
585           }
586 }
587 
588 static void
tcp_reno_newack(struct tcpcb * tp,const struct tcphdr * th)589 tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
590 {
591           /*
592            * When new data is acked, open the congestion window.
593            */
594 
595           u_int cw = tp->snd_cwnd;
596           u_int incr = tp->t_segsz;
597 
598           if (tcp_do_abc) {
599 
600                     /*
601                      * RFC 3465 Appropriate Byte Counting (ABC)
602                      */
603 
604                     int acked = th->th_ack - tp->snd_una;
605 
606                     if (cw >= tp->snd_ssthresh) {
607                               tp->t_bytes_acked += acked;
608                               if (tp->t_bytes_acked >= cw) {
609                                         /* Time to increase the window. */
610                                         tp->t_bytes_acked -= cw;
611                               } else {
612                                         /* No need to increase yet. */
613                                         incr = 0;
614                               }
615                     } else {
616                               /*
617                                * use 2*SMSS or 1*SMSS for the "L" param,
618                                * depending on sysctl setting.
619                                *
620                                * (See RFC 3465 2.3 Choosing the Limit)
621                                */
622                               u_int abc_lim;
623 
624                               abc_lim = (tcp_abc_aggressive == 0 ||
625                                   tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
626                               incr = uimin(acked, abc_lim);
627                     }
628           } else {
629 
630                     /*
631                      * If the window gives us less than ssthresh packets
632                      * in flight, open exponentially (segsz per packet).
633                      * Otherwise open linearly: segsz per window
634                      * (segsz^2 / cwnd per packet).
635                      */
636 
637                     if (cw >= tp->snd_ssthresh) {
638                               incr = incr * incr / cw;
639                     }
640           }
641 
642           tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
643 }
644 
645 const struct tcp_congctl tcp_reno_ctl = {
646           .fast_retransmit = tcp_reno_fast_retransmit,
647           .slow_retransmit = tcp_reno_slow_retransmit,
648           .fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
649           .newack = tcp_reno_newack,
650           .cong_exp = tcp_reno_congestion_exp,
651 };
652 
653 /*
654  * TCP/NewReno Congestion control.
655  */
656 static int
tcp_newreno_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)657 tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
658 {
659 
660           if (SEQ_LT(th->th_ack, tp->snd_high)) {
661                     /*
662                      * False fast retransmit after timeout.
663                      * Do not enter fast recovery
664                      */
665                     tp->t_dupacks = 0;
666                     return 1;
667           }
668           /*
669            * Fast retransmit is same as reno.
670            */
671           return tcp_reno_fast_retransmit(tp, th);
672 }
673 
674 /*
675  * Implement the NewReno response to a new ack, checking for partial acks in
676  * fast recovery.
677  */
678 static void
tcp_newreno_fast_retransmit_newack(struct tcpcb * tp,const struct tcphdr * th)679 tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
680 {
681           if (tp->t_partialacks < 0) {
682                     /*
683                      * We were not in fast recovery.  Reset the duplicate ack
684                      * counter.
685                      */
686                     tp->t_dupacks = 0;
687           } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
688                     /*
689                      * This is a partial ack.  Retransmit the first unacknowledged
690                      * segment and deflate the congestion window by the amount of
691                      * acknowledged data.  Do not exit fast recovery.
692                      */
693                     tcp_seq onxt = tp->snd_nxt;
694                     u_long ocwnd = tp->snd_cwnd;
695                     int sack_num_segs = 1, sack_bytes_rxmt = 0;
696 
697                     /*
698                      * snd_una has not yet been updated and the socket's send
699                      * buffer has not yet drained off the ACK'd data, so we
700                      * have to leave snd_una as it was to get the correct data
701                      * offset in tcp_output().
702                      */
703                     tp->t_partialacks++;
704                     TCP_TIMER_DISARM(tp, TCPT_REXMT);
705                     tp->t_rtttime = 0;
706 
707                     if (TCP_SACK_ENABLED(tp)) {
708                               /*
709                                * Partial ack handling within a sack recovery episode.
710                                * Keeping this very simple for now. When a partial ack
711                                * is received, force snd_cwnd to a value that will
712                                * allow the sender to transmit no more than 2 segments.
713                                * If necessary, a fancier scheme can be adopted at a
714                                * later point, but for now, the goal is to prevent the
715                                * sender from bursting a large amount of data in the
716                                * midst of sack recovery.
717                                */
718 
719                               /*
720                                * send one or 2 segments based on how much
721                                * new data was acked
722                                */
723                               if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
724                                         sack_num_segs = 2;
725                               (void)tcp_sack_output(tp, &sack_bytes_rxmt);
726                               tp->snd_cwnd = sack_bytes_rxmt +
727                                   (tp->snd_nxt - tp->sack_newdata) +
728                                   sack_num_segs * tp->t_segsz;
729                               tp->t_flags |= TF_ACKNOW;
730                               (void) tcp_output(tp);
731                     } else {
732                               tp->snd_nxt = th->th_ack;
733                               /*
734                                * Set snd_cwnd to one segment beyond ACK'd offset
735                                * snd_una is not yet updated when we're called
736                                */
737                               tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
738                               (void) tcp_output(tp);
739                               tp->snd_cwnd = ocwnd;
740                               if (SEQ_GT(onxt, tp->snd_nxt))
741                                         tp->snd_nxt = onxt;
742                               /*
743                                * Partial window deflation.  Relies on fact that
744                                * tp->snd_una not updated yet.
745                                */
746                               tp->snd_cwnd -= (th->th_ack - tp->snd_una -
747                                   tp->t_segsz);
748                     }
749           } else {
750                     /*
751                      * Complete ack.  Inflate the congestion window to ssthresh
752                      * and exit fast recovery.
753                      *
754                      * Window inflation should have left us with approx.
755                      * snd_ssthresh outstanding data.  But in case we
756                      * would be inclined to send a burst, better to do
757                      * it via the slow start mechanism.
758                      */
759                     if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
760                               tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
761                                   + tp->t_segsz;
762                     else
763                               tp->snd_cwnd = tp->snd_ssthresh;
764                     tp->t_partialacks = -1;
765                     tp->t_dupacks = 0;
766                     tp->t_bytes_acked = 0;
767                     if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
768                               tp->snd_fack = th->th_ack;
769           }
770 }
771 
772 static void
tcp_newreno_newack(struct tcpcb * tp,const struct tcphdr * th)773 tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
774 {
775           /*
776            * If we are still in fast recovery (meaning we are using
777            * NewReno and we have only received partial acks), do not
778            * inflate the window yet.
779            */
780           if (tp->t_partialacks < 0)
781                     tcp_reno_newack(tp, th);
782 }
783 
784 
785 const struct tcp_congctl tcp_newreno_ctl = {
786           .fast_retransmit = tcp_newreno_fast_retransmit,
787           .slow_retransmit = tcp_reno_slow_retransmit,
788           .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
789           .newack = tcp_newreno_newack,
790           .cong_exp = tcp_reno_congestion_exp,
791 };
792 
793 /*
794  * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
795  */
796 
797 /* Cubic prototypes */
798 static void         tcp_cubic_update_ctime(struct tcpcb *tp);
799 static uint32_t     tcp_cubic_diff_ctime(struct tcpcb *);
800 static uint32_t     tcp_cubic_cbrt(uint32_t);
801 static ulong        tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);
802 
803 /* Cubic TIME functions - XXX I don't like using timevals and microuptime */
804 /*
805  * Set congestion timer to now
806  */
807 static void
tcp_cubic_update_ctime(struct tcpcb * tp)808 tcp_cubic_update_ctime(struct tcpcb *tp)
809 {
810           struct timeval now_timeval;
811 
812           getmicrouptime(&now_timeval);
813           tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
814               now_timeval.tv_usec / 1000;
815 }
816 
817 /*
818  * milliseconds from last congestion
819  */
820 static uint32_t
tcp_cubic_diff_ctime(struct tcpcb * tp)821 tcp_cubic_diff_ctime(struct tcpcb *tp)
822 {
823           struct timeval now_timeval;
824 
825           getmicrouptime(&now_timeval);
826           return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
827               tp->snd_cubic_ctime;
828 }
829 
830 /*
831  * Approximate cubic root
832  */
833 #define CBRT_ROUNDS 30
834 static uint32_t
tcp_cubic_cbrt(uint32_t v)835 tcp_cubic_cbrt(uint32_t v)
836 {
837           int i, rounds = CBRT_ROUNDS;
838           uint64_t x = v / 3;
839 
840           /* We fail to calculate correct for small numbers */
841           if (v == 0)
842                     return 0;
843           else if (v < 4)
844                     return 1;
845 
846           /*
847            * largest x that 2*x^3+3*x fits 64bit
848            * Avoid overflow for a time cost
849            */
850           if (x > 2097151)
851                     rounds += 10;
852 
853           for (i = 0; i < rounds; i++)
854                     if (rounds == CBRT_ROUNDS)
855                               x = (v + 2 * x * x * x) / (3 * x * x);
856                     else
857                               /* Avoid overflow */
858                               x = v / (3 * x * x) + 2 * x / 3;
859 
860           return (uint32_t)x;
861 }
862 
863 /* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
864 static ulong
tcp_cubic_getW(struct tcpcb * tp,uint32_t ms_elapsed,uint32_t rtt)865 tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
866 {
867           uint32_t K;
868           long tK3;
869 
870           /* Section 3.1 Eq. 2 */
871           K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
872               CUBIC_CB / CUBIC_CA);
873           /*  (t-K)^3 - not clear why is the measure unit mattering */
874           tK3 = (long)(ms_elapsed + rtt) - (long)K;
875           tK3 = tK3 * tK3 * tK3;
876 
877           return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
878 }
879 
880 static void
tcp_cubic_congestion_exp(struct tcpcb * tp)881 tcp_cubic_congestion_exp(struct tcpcb *tp)
882 {
883 
884           /*
885            * Congestion - Set WMax and shrink cwnd
886            */
887           tcp_cubic_update_ctime(tp);
888 
889           /* Section 3.6 - Fast Convergence */
890           if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
891                     tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
892                     tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
893                         tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
894           } else {
895                     tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
896                     tp->snd_cubic_wmax = tp->snd_cwnd;
897           }
898 
899           tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);
900 
901           /* Shrink CWND */
902           tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
903 }
904 
905 static int
tcp_cubic_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)906 tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
907 {
908 
909           if (SEQ_LT(th->th_ack, tp->snd_high)) {
910                     /* See newreno */
911                     tp->t_dupacks = 0;
912                     return 1;
913           }
914 
915           /*
916            * mark WMax
917            */
918           tcp_cubic_congestion_exp(tp);
919 
920           /* Do fast retransmit */
921           return tcp_reno_do_fast_retransmit(tp, th);
922 }
923 
924 static void
tcp_cubic_newack(struct tcpcb * tp,const struct tcphdr * th)925 tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
926 {
927           uint32_t ms_elapsed, rtt;
928           u_long w_tcp;
929 
930           /* Congestion avoidance and not in fast recovery and usable rtt */
931           if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
932               /*
933                * t_srtt is 1/32 units of slow ticks
934                * converting it in ms would be equal to
935                * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
936                */
937               (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
938                     ms_elapsed = tcp_cubic_diff_ctime(tp);
939 
940                     /* Compute W_tcp(t) */
941                     w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
942                         ms_elapsed / rtt / 3;
943 
944                     if (tp->snd_cwnd > w_tcp) {
945                               /* Not in TCP friendly mode */
946                               tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
947                                   tp->snd_cwnd) / tp->snd_cwnd;
948                     } else {
949                               /* friendly TCP mode */
950                               tp->snd_cwnd = w_tcp;
951                     }
952 
953                     /* Make sure we are within limits */
954                     tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
955                     tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
956           } else {
957                     /* Use New Reno */
958                     tcp_newreno_newack(tp, th);
959           }
960 }
961 
962 static void
tcp_cubic_slow_retransmit(struct tcpcb * tp)963 tcp_cubic_slow_retransmit(struct tcpcb *tp)
964 {
965 
966           /* Timeout - Mark new congestion */
967           tcp_cubic_congestion_exp(tp);
968 
969           /* Loss Window MUST be one segment. */
970           tp->snd_cwnd = tp->t_segsz;
971           tp->t_partialacks = -1;
972           tp->t_dupacks = 0;
973           tp->t_bytes_acked = 0;
974 
975           if (TCP_ECN_ALLOWED(tp))
976                     tp->t_flags |= TF_ECN_SND_CWR;
977 }
978 
979 const struct tcp_congctl tcp_cubic_ctl = {
980           .fast_retransmit = tcp_cubic_fast_retransmit,
981           .slow_retransmit = tcp_cubic_slow_retransmit,
982           .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
983           .newack = tcp_cubic_newack,
984           .cong_exp = tcp_cubic_congestion_exp,
985 };
986