1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD: stable/10/sys/netinet/tcp_timer.c 330303 2018-03-03 00:54:12Z jhb $");
34 
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38 
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/mbuf.h>
43 #include <sys/mutex.h>
44 #include <sys/protosw.h>
45 #include <sys/smp.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50 
51 #include <net/if.h>
52 #include <net/route.h>
53 #include <net/vnet.h>
54 
55 #include <netinet/cc.h>
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_systm.h>
59 #ifdef INET6
60 #include <netinet6/in6_pcb.h>
61 #endif
62 #include <netinet/ip_var.h>
63 #include <netinet/tcp_fsm.h>
64 #include <netinet/tcp_timer.h>
65 #include <netinet/tcp_var.h>
66 #ifdef INET6
67 #include <netinet6/tcp6_var.h>
68 #endif
69 #include <netinet/tcpip.h>
70 #ifdef TCPDEBUG
71 #include <netinet/tcp_debug.h>
72 #endif
73 
/*
 * Timer tunables exported under the net.inet.tcp sysctl node.
 *
 * The variables registered with SYSCTL_PROC are all routed through the
 * sysctl_msec_to_ticks handler (NOTE(review): presumably the sysctl value
 * is expressed in milliseconds while the variable itself holds ticks --
 * confirm against the handler).  None of those variables is given an
 * initial value in this block.
 */
int    tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

int    tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

/*
 * Plain integer knob, enabled by default.  tcp_timer_keep() treats a
 * non-zero value as if SO_KEEPALIVE were set on the socket.  The strong
 * reference additionally exposes the same object under the symbol name
 * always_keepalive.
 */
int	tcp_always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
__strong_reference(tcp_always_keepalive, always_keepalive);

/*
 * Disabled by default.  When non-zero, tcp_timer_2msl() closes a
 * FIN_WAIT_2 connection whose socket can no longer receive instead of
 * waiting for it to go idle.
 */
int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

/* Plain count (not a time value), defaulting to TCPTV_KEEPCNT. */
int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/* max idle probes */
	/*
	 * Not exported through sysctl here; tcp_timer_persist() compares the
	 * idle time (in ticks) against it when deciding to drop.
	 */
int	tcp_maxpersistidle;

/*
 * File-local knob, off by default.  When set, tcp_timer_rexmt() clears the
 * window-scale, timestamp and SACK request flags once a SYN_SENT connection
 * reaches its third retransmit.
 */
static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");
138 
/*
 * Per-vnet state for Path MTU Discovery black-hole detection, consumed by
 * tcp_timer_rexmt().  Each variable is declared with VNET_DEFINE, accessed
 * through its V_ macro and exported under net.inet.tcp.
 */

/* Read-write switch: black-hole detection only runs when this is non-zero. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

/* Read-only counter: bumped each time the MSS is clamped to the blackhole MSS. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

/* Read-only counter: bumped each time the MSS falls back to the default MSS. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

/* Read-only counter: bumped when detection gives up and the saved MSS is restored. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
/* Value t_maxopd is lowered to on the first IPv4 clamp step (default 1200). */
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
/* Value t_maxopd is lowered to on the first IPv6 clamp step (default 1220). */
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif
186 
/* Off by default; when zero, INP_CPU() below always evaluates to CPU 0. */
static int	per_cpu_timers = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers , 0, "run tcp timers on all cpus");

/*
 * CPU on which to schedule a connection's timers.  With per_cpu_timers
 * enabled this is inp_flowid modulo (mp_maxid + 1), unless that CPU is
 * absent, in which case the current CPU is used.  With per_cpu_timers
 * disabled it is always 0.  Note that the argument is evaluated more than
 * once.
 */
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
193 
/*
 * Tcp protocol slow timeout routine (historically called every 500 ms;
 * the calling period is set by the caller and is not visible here).
 *
 * Walks every vnet under the vnet list read lock and, with that vnet made
 * current, runs tcp_tw_2msl_scan(0); the scan's return value is ignored.
 * The per-connection timers are not driven from here; they are the
 * callout handlers defined below.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		/* Switch vnet context for the duration of the scan. */
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
212 
/*
 * Retransmit backoff multipliers, indexed by t_rxtshift
 * (0 .. TCP_MAXRXTSHIFT).  tcp_timer_rexmt() multiplies TCPTV_RTOBASE by
 * tcp_syn_backoff[] in SYN_SENT/SYN_RECEIVED, and TCP_REXMTVAL(tp) by
 * tcp_backoff[] in every other state.
 */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

/*
 * Must be kept in sync with tcp_backoff[] by hand; used by
 * tcp_timer_persist() as the multiplier for its idle-drop threshold.
 */
static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
220 
221 /*
222  * TCP timer processing.
223  */
224 
225 void
tcp_timer_delack(void * xtp)226 tcp_timer_delack(void *xtp)
227 {
228 	struct tcpcb *tp = xtp;
229 	struct inpcb *inp;
230 	CURVNET_SET(tp->t_vnet);
231 
232 	inp = tp->t_inpcb;
233 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
234 	INP_WLOCK(inp);
235 	if (callout_pending(&tp->t_timers->tt_delack) ||
236 	    !callout_active(&tp->t_timers->tt_delack)) {
237 		INP_WUNLOCK(inp);
238 		CURVNET_RESTORE();
239 		return;
240 	}
241 	callout_deactivate(&tp->t_timers->tt_delack);
242 	if ((inp->inp_flags & INP_DROPPED) != 0) {
243 		INP_WUNLOCK(inp);
244 		CURVNET_RESTORE();
245 		return;
246 	}
247 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
248 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
249 	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
250 		("%s: tp %p delack callout should be running", __func__, tp));
251 
252 	tp->t_flags |= TF_ACKNOW;
253 	TCPSTAT_INC(tcps_delack);
254 	(void) tcp_output(tp);
255 	INP_WUNLOCK(inp);
256 	CURVNET_RESTORE();
257 }
258 
259 void
tcp_timer_2msl(void * xtp)260 tcp_timer_2msl(void *xtp)
261 {
262 	struct tcpcb *tp = xtp;
263 	struct inpcb *inp;
264 	CURVNET_SET(tp->t_vnet);
265 #ifdef TCPDEBUG
266 	int ostate;
267 
268 	ostate = tp->t_state;
269 #endif
270 	INP_INFO_RLOCK(&V_tcbinfo);
271 	inp = tp->t_inpcb;
272 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
273 	INP_WLOCK(inp);
274 	tcp_free_sackholes(tp);
275 	if (callout_pending(&tp->t_timers->tt_2msl) ||
276 	    !callout_active(&tp->t_timers->tt_2msl)) {
277 		INP_WUNLOCK(tp->t_inpcb);
278 		INP_INFO_RUNLOCK(&V_tcbinfo);
279 		CURVNET_RESTORE();
280 		return;
281 	}
282 	callout_deactivate(&tp->t_timers->tt_2msl);
283 	if ((inp->inp_flags & INP_DROPPED) != 0) {
284 		INP_WUNLOCK(inp);
285 		INP_INFO_RUNLOCK(&V_tcbinfo);
286 		CURVNET_RESTORE();
287 		return;
288 	}
289 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
290 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
291 	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
292 		("%s: tp %p 2msl callout should be running", __func__, tp));
293 	/*
294 	 * 2 MSL timeout in shutdown went off.  If we're closed but
295 	 * still waiting for peer to close and connection has been idle
296 	 * too long delete connection control block.  Otherwise, check
297 	 * again in a bit.
298 	 *
299 	 * If in TIME_WAIT state just ignore as this timeout is handled in
300 	 * tcp_tw_2msl_scan().
301 	 *
302 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
303 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
304 	 * Ignore fact that there were recent incoming segments.
305 	 */
306 	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
307 		INP_WUNLOCK(inp);
308 		INP_INFO_RUNLOCK(&V_tcbinfo);
309 		CURVNET_RESTORE();
310 		return;
311 	}
312 	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
313 	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
314 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
315 		TCPSTAT_INC(tcps_finwait2_drops);
316 		tp = tcp_close(tp);
317 	} else {
318 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
319 			if (!callout_reset(&tp->t_timers->tt_2msl,
320 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
321 				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
322 			}
323 		} else
324 		       tp = tcp_close(tp);
325        }
326 
327 #ifdef TCPDEBUG
328 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
329 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
330 			  PRU_SLOWTIMO);
331 #endif
332 	if (tp != NULL)
333 		INP_WUNLOCK(inp);
334 	INP_INFO_RUNLOCK(&V_tcbinfo);
335 	CURVNET_RESTORE();
336 }
337 
/*
 * Keepalive callout handler.  xtp is the tcpcb the callout was armed for.
 *
 * Runs with the tcbinfo read lock and the inpcb write lock held.  Depending
 * on connection state and idle time it either drops the connection with
 * ETIMEDOUT, sends a keepalive probe and re-arms itself with the keepalive
 * interval, or simply re-arms itself with the keepalive idle time.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Rescheduled (pending) or stopped (not active) while we waited. */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	/* Nothing to do for a connection that has already been dropped. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	/* Timer fired before the connection got established: give up. */
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past the idle time plus the whole probing budget. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		/* A failed template allocation just skips this probe. */
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/*
		 * Re-arm for the next probe.  A zero return from
		 * callout_reset() means no pending callout was cancelled,
		 * so the "reset" bookkeeping flag is cleared.
		 */
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp)) {
			/* Keepalives not in use: just re-arm with the idle time. */
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/*
	 * tcp_drop() may return NULL; the inpcb is only unlocked here when
	 * a tcpcb is handed back (NOTE(review): presumably the NULL case
	 * means the inpcb lock has already been released -- confirm in
	 * tcp_drop()/tcp_close()).
	 */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
437 
438 void
tcp_timer_persist(void * xtp)439 tcp_timer_persist(void *xtp)
440 {
441 	struct tcpcb *tp = xtp;
442 	struct inpcb *inp;
443 	CURVNET_SET(tp->t_vnet);
444 #ifdef TCPDEBUG
445 	int ostate;
446 
447 	ostate = tp->t_state;
448 #endif
449 	INP_INFO_RLOCK(&V_tcbinfo);
450 	inp = tp->t_inpcb;
451 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
452 	INP_WLOCK(inp);
453 	if (callout_pending(&tp->t_timers->tt_persist) ||
454 	    !callout_active(&tp->t_timers->tt_persist)) {
455 		INP_WUNLOCK(inp);
456 		INP_INFO_RUNLOCK(&V_tcbinfo);
457 		CURVNET_RESTORE();
458 		return;
459 	}
460 	callout_deactivate(&tp->t_timers->tt_persist);
461 	if ((inp->inp_flags & INP_DROPPED) != 0) {
462 		INP_WUNLOCK(inp);
463 		INP_INFO_RUNLOCK(&V_tcbinfo);
464 		CURVNET_RESTORE();
465 		return;
466 	}
467 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
468 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
469 	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
470 		("%s: tp %p persist callout should be running", __func__, tp));
471 	/*
472 	 * Persistance timer into zero window.
473 	 * Force a byte to be output, if possible.
474 	 */
475 	TCPSTAT_INC(tcps_persisttimeo);
476 	/*
477 	 * Hack: if the peer is dead/unreachable, we do not
478 	 * time out if the window is closed.  After a full
479 	 * backoff, drop the connection if the idle time
480 	 * (no responses to probes) reaches the maximum
481 	 * backoff that we would use if retransmitting.
482 	 */
483 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
484 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
485 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
486 		TCPSTAT_INC(tcps_persistdrop);
487 		tp = tcp_drop(tp, ETIMEDOUT);
488 		goto out;
489 	}
490 	/*
491 	 * If the user has closed the socket then drop a persisting
492 	 * connection after a much reduced timeout.
493 	 */
494 	if (tp->t_state > TCPS_CLOSE_WAIT &&
495 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
496 		TCPSTAT_INC(tcps_persistdrop);
497 		tp = tcp_drop(tp, ETIMEDOUT);
498 		goto out;
499 	}
500 	tcp_setpersist(tp);
501 	tp->t_flags |= TF_FORCEDATA;
502 	(void) tcp_output(tp);
503 	tp->t_flags &= ~TF_FORCEDATA;
504 
505 out:
506 #ifdef TCPDEBUG
507 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
508 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
509 #endif
510 	if (tp != NULL)
511 		INP_WUNLOCK(inp);
512 	INP_INFO_RUNLOCK(&V_tcbinfo);
513 	CURVNET_RESTORE();
514 }
515 
516 void
tcp_timer_rexmt(void * xtp)517 tcp_timer_rexmt(void * xtp)
518 {
519 	struct tcpcb *tp = xtp;
520 	CURVNET_SET(tp->t_vnet);
521 	int rexmt;
522 	int headlocked;
523 	struct inpcb *inp;
524 #ifdef TCPDEBUG
525 	int ostate;
526 
527 	ostate = tp->t_state;
528 #endif
529 
530 	INP_INFO_RLOCK(&V_tcbinfo);
531 	inp = tp->t_inpcb;
532 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
533 	INP_WLOCK(inp);
534 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
535 	    !callout_active(&tp->t_timers->tt_rexmt)) {
536 		INP_WUNLOCK(inp);
537 		INP_INFO_RUNLOCK(&V_tcbinfo);
538 		CURVNET_RESTORE();
539 		return;
540 	}
541 	callout_deactivate(&tp->t_timers->tt_rexmt);
542 	if ((inp->inp_flags & INP_DROPPED) != 0) {
543 		INP_WUNLOCK(inp);
544 		INP_INFO_RUNLOCK(&V_tcbinfo);
545 		CURVNET_RESTORE();
546 		return;
547 	}
548 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
549 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
550 	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
551 		("%s: tp %p rexmt callout should be running", __func__, tp));
552 	tcp_free_sackholes(tp);
553 	/*
554 	 * Retransmission timer went off.  Message has not
555 	 * been acked within retransmit interval.  Back off
556 	 * to a longer retransmit interval and retransmit one segment.
557 	 */
558 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
559 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
560 		TCPSTAT_INC(tcps_timeoutdrop);
561 
562 		tp = tcp_drop(tp, tp->t_softerror ?
563 			      tp->t_softerror : ETIMEDOUT);
564 		headlocked = 1;
565 		goto out;
566 	}
567 	INP_INFO_RUNLOCK(&V_tcbinfo);
568 	headlocked = 0;
569 	if (tp->t_state == TCPS_SYN_SENT) {
570 		/*
571 		 * If the SYN was retransmitted, indicate CWND to be
572 		 * limited to 1 segment in cc_conn_init().
573 		 */
574 		tp->snd_cwnd = 1;
575 	} else if (tp->t_rxtshift == 1) {
576 		/*
577 		 * first retransmit; record ssthresh and cwnd so they can
578 		 * be recovered if this turns out to be a "bad" retransmit.
579 		 * A retransmit is considered "bad" if an ACK for this
580 		 * segment is received within RTT/2 interval; the assumption
581 		 * here is that the ACK was already in flight.  See
582 		 * "On Estimating End-to-End Network Path Properties" by
583 		 * Allman and Paxson for more details.
584 		 */
585 		tp->snd_cwnd_prev = tp->snd_cwnd;
586 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
587 		tp->snd_recover_prev = tp->snd_recover;
588 		if (IN_FASTRECOVERY(tp->t_flags))
589 			tp->t_flags |= TF_WASFRECOVERY;
590 		else
591 			tp->t_flags &= ~TF_WASFRECOVERY;
592 		if (IN_CONGRECOVERY(tp->t_flags))
593 			tp->t_flags |= TF_WASCRECOVERY;
594 		else
595 			tp->t_flags &= ~TF_WASCRECOVERY;
596 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
597 		tp->t_flags |= TF_PREVVALID;
598 	} else
599 		tp->t_flags &= ~TF_PREVVALID;
600 	TCPSTAT_INC(tcps_rexmttimeo);
601 	if ((tp->t_state == TCPS_SYN_SENT) ||
602 	    (tp->t_state == TCPS_SYN_RECEIVED))
603 		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
604 	else
605 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
606 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
607 		      tp->t_rttmin, TCPTV_REXMTMAX);
608 
609 	/*
610 	 * We enter the path for PLMTUD if connection is established or, if
611 	 * connection is FIN_WAIT_1 status, reason for the last is that if
612 	 * amount of data we send is very small, we could send it in couple of
613 	 * packets and process straight to FIN. In that case we won't catch
614 	 * ESTABLISHED state.
615 	 */
616 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
617 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
618 		int optlen;
619 #ifdef INET6
620 		int isipv6;
621 #endif
622 
623 		/*
624 		 * Idea here is that at each stage of mtu probe (usually, 1448
625 		 * -> 1188 -> 524) should be given 2 chances to recover before
626 		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
627 		 *  take care of that.
628 		 */
629 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
630 		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
631 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
632 			/*
633 			 * Enter Path MTU Black-hole Detection mechanism:
634 			 * - Disable Path MTU Discovery (IP "DF" bit).
635 			 * - Reduce MTU to lower value than what we
636 			 *   negotiated with peer.
637 			 */
638 			/* Record that we may have found a black hole. */
639 			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
640 
641 			/* Keep track of previous MSS. */
642 			optlen = tp->t_maxopd - tp->t_maxseg;
643 			tp->t_pmtud_saved_maxopd = tp->t_maxopd;
644 
645 			/*
646 			 * Reduce the MSS to blackhole value or to the default
647 			 * in an attempt to retransmit.
648 			 */
649 #ifdef INET6
650 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
651 			if (isipv6 &&
652 			    tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
653 				/* Use the sysctl tuneable blackhole MSS. */
654 				tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
655 				V_tcp_pmtud_blackhole_activated++;
656 			} else if (isipv6) {
657 				/* Use the default MSS. */
658 				tp->t_maxopd = V_tcp_v6mssdflt;
659 				/*
660 				 * Disable Path MTU Discovery when we switch to
661 				 * minmss.
662 				 */
663 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
664 				V_tcp_pmtud_blackhole_activated_min_mss++;
665 			}
666 #endif
667 #if defined(INET6) && defined(INET)
668 			else
669 #endif
670 #ifdef INET
671 			if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
672 				/* Use the sysctl tuneable blackhole MSS. */
673 				tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
674 				V_tcp_pmtud_blackhole_activated++;
675 			} else {
676 				/* Use the default MSS. */
677 				tp->t_maxopd = V_tcp_mssdflt;
678 				/*
679 				 * Disable Path MTU Discovery when we switch to
680 				 * minmss.
681 				 */
682 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
683 				V_tcp_pmtud_blackhole_activated_min_mss++;
684 			}
685 #endif
686 			tp->t_maxseg = tp->t_maxopd - optlen;
687 			/*
688 			 * Reset the slow-start flight size
689 			 * as it may depend on the new MSS.
690 			 */
691 			if (CC_ALGO(tp)->conn_init != NULL)
692 				CC_ALGO(tp)->conn_init(tp->ccv);
693 		} else {
694 			/*
695 			 * If further retransmissions are still unsuccessful
696 			 * with a lowered MTU, maybe this isn't a blackhole and
697 			 * we restore the previous MSS and blackhole detection
698 			 * flags.
699 			 * The limit '6' is determined by giving each probe
700 			 * stage (1448, 1188, 524) 2 chances to recover.
701 			 */
702 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
703 			    (tp->t_rxtshift > 6)) {
704 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
705 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
706 				optlen = tp->t_maxopd - tp->t_maxseg;
707 				tp->t_maxopd = tp->t_pmtud_saved_maxopd;
708 				tp->t_maxseg = tp->t_maxopd - optlen;
709 				V_tcp_pmtud_blackhole_failed++;
710 				/*
711 				 * Reset the slow-start flight size as it
712 				 * may depend on the new MSS.
713 				 */
714 				if (CC_ALGO(tp)->conn_init != NULL)
715 					CC_ALGO(tp)->conn_init(tp->ccv);
716 			}
717 		}
718 	}
719 
720 	/*
721 	 * Disable RFC1323 and SACK if we haven't got any response to
722 	 * our third SYN to work-around some broken terminal servers
723 	 * (most of which have hopefully been retired) that have bad VJ
724 	 * header compression code which trashes TCP segments containing
725 	 * unknown-to-them TCP options.
726 	 */
727 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
728 	    (tp->t_rxtshift == 3))
729 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
730 	/*
731 	 * If we backed off this far, our srtt estimate is probably bogus.
732 	 * Clobber it so we'll take the next rtt measurement as our srtt;
733 	 * move the current srtt into rttvar to keep the current
734 	 * retransmit times until then.
735 	 */
736 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
737 #ifdef INET6
738 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
739 			in6_losing(tp->t_inpcb);
740 #endif
741 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
742 		tp->t_srtt = 0;
743 	}
744 	tp->snd_nxt = tp->snd_una;
745 	tp->snd_recover = tp->snd_max;
746 	/*
747 	 * Force a segment to be sent.
748 	 */
749 	tp->t_flags |= TF_ACKNOW;
750 	/*
751 	 * If timing a segment in this window, stop the timer.
752 	 */
753 	tp->t_rtttime = 0;
754 
755 	cc_cong_signal(tp, NULL, CC_RTO);
756 
757 	(void) tcp_output(tp);
758 
759 out:
760 #ifdef TCPDEBUG
761 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
762 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
763 			  PRU_SLOWTIMO);
764 #endif
765 	if (tp != NULL)
766 		INP_WUNLOCK(inp);
767 	if (headlocked)
768 		INP_INFO_RUNLOCK(&V_tcbinfo);
769 	CURVNET_RESTORE();
770 }
771 
772 void
tcp_timer_activate(struct tcpcb * tp,uint32_t timer_type,u_int delta)773 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
774 {
775 	struct callout *t_callout;
776 	timeout_t *f_callout;
777 	struct inpcb *inp = tp->t_inpcb;
778 	int cpu = INP_CPU(inp);
779 	uint32_t f_reset;
780 
781 #ifdef TCP_OFFLOAD
782 	if (tp->t_flags & TF_TOE)
783 		return;
784 #endif
785 
786 	if (tp->t_timers->tt_flags & TT_STOPPED)
787 		return;
788 
789 	switch (timer_type) {
790 		case TT_DELACK:
791 			t_callout = &tp->t_timers->tt_delack;
792 			f_callout = tcp_timer_delack;
793 			f_reset = TT_DELACK_RST;
794 			break;
795 		case TT_REXMT:
796 			t_callout = &tp->t_timers->tt_rexmt;
797 			f_callout = tcp_timer_rexmt;
798 			f_reset = TT_REXMT_RST;
799 			break;
800 		case TT_PERSIST:
801 			t_callout = &tp->t_timers->tt_persist;
802 			f_callout = tcp_timer_persist;
803 			f_reset = TT_PERSIST_RST;
804 			break;
805 		case TT_KEEP:
806 			t_callout = &tp->t_timers->tt_keep;
807 			f_callout = tcp_timer_keep;
808 			f_reset = TT_KEEP_RST;
809 			break;
810 		case TT_2MSL:
811 			t_callout = &tp->t_timers->tt_2msl;
812 			f_callout = tcp_timer_2msl;
813 			f_reset = TT_2MSL_RST;
814 			break;
815 		default:
816 			panic("tp %p bad timer_type %#x", tp, timer_type);
817 		}
818 	if (delta == 0) {
819 		if ((tp->t_timers->tt_flags & timer_type) &&
820 		    callout_stop(t_callout) &&
821 		    (tp->t_timers->tt_flags & f_reset)) {
822 			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
823 		}
824 	} else {
825 		if ((tp->t_timers->tt_flags & timer_type) == 0) {
826 			tp->t_timers->tt_flags |= (timer_type | f_reset);
827 			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
828 		} else {
829 			/* Reset already running callout on the same CPU. */
830 			if (!callout_reset(t_callout, delta, f_callout, tp)) {
831 				/*
832 				 * Callout not cancelled, consider it as not
833 				 * properly restarted. */
834 				tp->t_timers->tt_flags &= ~f_reset;
835 			}
836 		}
837 	}
838 }
839 
840 int
tcp_timer_active(struct tcpcb * tp,uint32_t timer_type)841 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
842 {
843 	struct callout *t_callout;
844 
845 	switch (timer_type) {
846 		case TT_DELACK:
847 			t_callout = &tp->t_timers->tt_delack;
848 			break;
849 		case TT_REXMT:
850 			t_callout = &tp->t_timers->tt_rexmt;
851 			break;
852 		case TT_PERSIST:
853 			t_callout = &tp->t_timers->tt_persist;
854 			break;
855 		case TT_KEEP:
856 			t_callout = &tp->t_timers->tt_keep;
857 			break;
858 		case TT_2MSL:
859 			t_callout = &tp->t_timers->tt_2msl;
860 			break;
861 		default:
862 			panic("tp %p bad timer_type %#x", tp, timer_type);
863 		}
864 	return callout_active(t_callout);
865 }
866 
/*
 * Permanently stop one of a connection's timers.
 *
 * Sets TT_STOPPED first, which makes tcp_timer_activate() a no-op from
 * then on.  If the timer is marked running, try to stop its callout; when
 * that cannot be confirmed, the callout is re-pointed at the matching
 * *_discard handler instead (see the comment in the body).
 *
 * timer_type must be one of TT_DELACK, TT_REXMT, TT_PERSIST, TT_KEEP or
 * TT_2MSL; anything else panics.
 */
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	/* Map the timer type to its callout, discard handler and reset flag. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack_discard;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt_discard;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist_discard;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep_discard;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl_discard;
			f_reset = TT_2MSL_RST;
			break;
		default:
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}

	if (tp->t_timers->tt_flags & timer_type) {
		/*
		 * Clean stop: the callout was cancelled and it had been
		 * properly (re)started, so both bookkeeping bits go away.
		 */
		if (callout_stop(t_callout) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		} else {
			/*
			 * Can't stop the callout, defer tcpcb actual deletion
			 * to the last tcp timer discard callout.
			 * The TT_STOPPED flag will ensure that no tcp timer
			 * callouts can be restarted on our behalf, and
			 * past this point currently running callouts waiting
			 * on inp lock will return right away after the
			 * classical check for callout reset/stop events:
			 * callout_pending() || !callout_active()
			 */
			callout_reset(t_callout, 1, f_callout, tp);
		}
	}
}
925 
/*
 * Convert a tick count to milliseconds using the global hz.
 *
 * The multiplication is done in signed 64-bit arithmetic: in 32 bits,
 * 1000 * t wraps (unsigned) or overflows (signed) once t exceeds roughly
 * 4.29 (or 2.15) million ticks, which yields a bogus result.  int64_t
 * represents every 32-bit signed and unsigned value, so both kinds of
 * argument keep their meaning.
 */
#define	ticks_to_msecs(t)	((int64_t)1000 * (t) / hz)
927 
/*
 * Fill an exported timer snapshot (xtimer) from a connection's timers.
 *
 * tp     - connection; only t_rcvtime is read from it.
 * timer  - the connection's timer block; may be NULL, in which case xtimer
 *          is left entirely zeroed.
 * xtimer - output; always zeroed first.
 *
 * For every callout that is currently active, the corresponding field is
 * set to (c_time - now) / SBT_1MS (NOTE(review): presumably the time left
 * until expiry in milliseconds, c_time being an absolute sbintime_t
 * deadline -- confirm against the callout implementation).  Fields of
 * inactive callouts stay 0.  t_rcvtime receives ticks - tp->t_rcvtime
 * converted with ticks_to_msecs().
 */
void
tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
    struct xtcp_timer *xtimer)
{
	sbintime_t now;

	bzero(xtimer, sizeof(*xtimer));
	if (timer == NULL)
		return;
	/* One timestamp so that all fields are relative to the same instant. */
	now = getsbinuptime();
	if (callout_active(&timer->tt_delack))
		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_rexmt))
		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_persist))
		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_keep))
		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_2msl))
		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}
950