1 /* $OpenBSD: tcp_timer.c,v 1.37 2005/06/30 08:51:31 markus Exp $ */
2 /* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)tcp_timer.c 8.1 (Berkeley) 6/10/93
33 */
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/protosw.h>
41 #include <sys/kernel.h>
42
43 #include <net/route.h>
44
45 #include <netinet/in.h>
46 #include <netinet/in_systm.h>
47 #include <netinet/ip.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/ip_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_timer.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/ip_icmp.h>
55 #include <netinet/tcp_seq.h>
56
/*
 * Keepalive and persist tunables.  tcp_keepidle, tcp_keepintvl and
 * tcp_maxpersistidle may be patched to a non-zero value; any that is
 * still zero gets its compiled-in default in tcp_timer_init().
 * tcp_maxidle is not a tunable: tcp_slowtimo() recomputes it from
 * tcp_keepintvl on every call.
 */
int	tcp_keepidle;		/* keep timer period while not probing */
int	tcp_keepintvl;		/* keep timer period between probes */
int	tcp_maxpersistidle;	/* max idle time in persist */
int	tcp_maxidle;		/* TCPTV_KEEPCNT * tcp_keepintvl */

/*
 * Time to delay the ACK.  This is initialized in tcp_init() (through
 * tcp_timer_init()), unless it's patched to a non-zero value beforehand.
 */
int	tcp_delack_ticks;
67
68 void tcp_timer_rexmt(void *);
69 void tcp_timer_persist(void *);
70 void tcp_timer_keep(void *);
71 void tcp_timer_2msl(void *);
72
73 const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
74 tcp_timer_rexmt,
75 tcp_timer_persist,
76 tcp_timer_keep,
77 tcp_timer_2msl,
78 };
79
80 /*
81 * Timer state initialization, called from tcp_init().
82 */
83 void
tcp_timer_init(void)84 tcp_timer_init(void)
85 {
86
87 if (tcp_keepidle == 0)
88 tcp_keepidle = TCPTV_KEEP_IDLE;
89
90 if (tcp_keepintvl == 0)
91 tcp_keepintvl = TCPTV_KEEPINTVL;
92
93 if (tcp_maxpersistidle == 0)
94 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
95
96 if (tcp_delack_ticks == 0)
97 tcp_delack_ticks = TCP_DELACK_TICKS;
98 }
99
100 /*
101 * Callout to process delayed ACKs for a TCPCB.
102 */
103 void
tcp_delack(void * arg)104 tcp_delack(void *arg)
105 {
106 struct tcpcb *tp = arg;
107 int s;
108
109 /*
110 * If tcp_output() wasn't able to transmit the ACK
111 * for whatever reason, it will restart the delayed
112 * ACK callout.
113 */
114
115 s = splsoftnet();
116 if (tp->t_flags & TF_DEAD) {
117 splx(s);
118 return;
119 }
120 tp->t_flags |= TF_ACKNOW;
121 (void) tcp_output(tp);
122 splx(s);
123 }
124
125 /*
126 * Tcp protocol timeout routine called every 500 ms.
127 * Updates the timers in all active tcb's and
128 * causes finite state machine actions if timers expire.
129 */
130 void
tcp_slowtimo()131 tcp_slowtimo()
132 {
133 int s;
134
135 s = splsoftnet();
136 tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
137 #ifdef TCP_COMPAT_42
138 tcp_iss += TCP_ISSINCR/PR_SLOWHZ; /* increment iss */
139 if ((int)tcp_iss < 0)
140 tcp_iss = 0; /* XXX */
141 #endif /* TCP_COMPAT_42 */
142 tcp_now++; /* for timestamps */
143 splx(s);
144 }
145
146 /*
147 * Cancel all timers for TCP tp.
148 */
149 void
tcp_canceltimers(tp)150 tcp_canceltimers(tp)
151 struct tcpcb *tp;
152 {
153 int i;
154
155 for (i = 0; i < TCPT_NTIMERS; i++)
156 TCP_TIMER_DISARM(tp, i);
157 }
158
159 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
160 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
161
162 int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
163
164 /*
165 * TCP timer processing.
166 */
167
#ifdef TCP_SACK
void	tcp_timer_freesack(struct tcpcb *);

/*
 * Free SACK holes for 2MSL and REXMT timers.
 *
 * Every entry on tp->snd_holes is returned to sackhl_pool and the list
 * head is cleared.  With TCP_FACK the forward-ACK state (snd_fack,
 * retran_data, snd_awnd) is reset as well.
 */
void
tcp_timer_freesack(struct tcpcb *tp)
{
	struct sackhole *hole, *following;

	for (hole = tp->snd_holes; hole != NULL; hole = following) {
		/* Fetch the successor before the hole goes back to the pool. */
		following = hole->next;
		pool_put(&sackhl_pool, hole);
	}
	tp->snd_holes = NULL;
#ifdef TCP_FACK
	tp->snd_fack = tp->snd_una;
	tp->retran_data = 0;
	tp->snd_awnd = 0;
#endif /* TCP_FACK */
}
#endif /* TCP_SACK */
192
193 void
tcp_timer_rexmt(void * arg)194 tcp_timer_rexmt(void *arg)
195 {
196 struct tcpcb *tp = arg;
197 uint32_t rto;
198 int s;
199
200 s = splsoftnet();
201 if (tp->t_flags & TF_DEAD) {
202 splx(s);
203 return;
204 }
205
206 if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
207 SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
208 SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_maxseg))) {
209 extern struct sockaddr_in icmpsrc;
210 struct icmp icmp;
211
212 tp->t_flags &= ~TF_PMTUD_PEND;
213
214 /* XXX create fake icmp message with relevant entries */
215 icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
216 icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
217 icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
218 icmpsrc.sin_addr = tp->t_inpcb->inp_faddr;
219 icmp_mtudisc(&icmp);
220
221 /*
222 * Notify all connections to the same peer about
223 * new mss and trigger retransmit.
224 */
225 in_pcbnotifyall(&tcbtable, sintosa(&icmpsrc), EMSGSIZE,
226 tcp_mtudisc);
227 splx(s);
228 return;
229 }
230
231 #ifdef TCP_SACK
232 tcp_timer_freesack(tp);
233 #endif
234 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
235 tp->t_rxtshift = TCP_MAXRXTSHIFT;
236 tcpstat.tcps_timeoutdrop++;
237 (void)tcp_drop(tp, tp->t_softerror ?
238 tp->t_softerror : ETIMEDOUT);
239 goto out;
240 }
241 tcpstat.tcps_rexmttimeo++;
242 rto = TCP_REXMTVAL(tp);
243 if (rto < tp->t_rttmin)
244 rto = tp->t_rttmin;
245 TCPT_RANGESET(tp->t_rxtcur,
246 rto * tcp_backoff[tp->t_rxtshift],
247 tp->t_rttmin, TCPTV_REXMTMAX);
248 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
249
250 /*
251 * If we are losing and we are trying path MTU discovery,
252 * try turning it off. This will avoid black holes in
253 * the network which suppress or fail to send "packet
254 * too big" ICMP messages. We should ideally do
255 * lots more sophisticated searching to find the right
256 * value here...
257 */
258 if (ip_mtudisc && tp->t_inpcb &&
259 TCPS_HAVEESTABLISHED(tp->t_state) &&
260 tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
261 struct inpcb *inp = tp->t_inpcb;
262 struct rtentry *rt = NULL;
263 struct sockaddr_in sin;
264
265 /* No data to send means path mtu is not a problem */
266 if (!inp->inp_socket->so_snd.sb_cc)
267 goto leave;
268
269 rt = in_pcbrtentry(inp);
270 /* Check if path MTU discovery is disabled already */
271 if (rt && (rt->rt_flags & RTF_HOST) &&
272 (rt->rt_rmx.rmx_locks & RTV_MTU))
273 goto leave;
274
275 rt = NULL;
276 switch(tp->pf) {
277 #ifdef INET6
278 case PF_INET6:
279 /*
280 * We can not turn off path MTU for IPv6.
281 * Do nothing for now, maybe lower to
282 * minimum MTU.
283 */
284 break;
285 #endif
286 case PF_INET:
287 bzero(&sin, sizeof(struct sockaddr_in));
288 sin.sin_family = AF_INET;
289 sin.sin_len = sizeof(struct sockaddr_in);
290 sin.sin_addr = inp->inp_faddr;
291 rt = icmp_mtudisc_clone(sintosa(&sin));
292 break;
293 }
294 if (rt != NULL) {
295 /* Disable path MTU discovery */
296 if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0) {
297 rt->rt_rmx.rmx_locks |= RTV_MTU;
298 in_rtchange(inp, 0);
299 }
300
301 rtfree(rt);
302 }
303 leave:
304 ;
305 }
306
307 /*
308 * If losing, let the lower level know and try for
309 * a better route. Also, if we backed off this far,
310 * our srtt estimate is probably bogus. Clobber it
311 * so we'll take the next rtt measurement as our srtt;
312 * move the current srtt into rttvar to keep the current
313 * retransmit times until then.
314 */
315 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
316 in_losing(tp->t_inpcb);
317 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
318 tp->t_srtt = 0;
319 }
320 tp->snd_nxt = tp->snd_una;
321 #if defined(TCP_SACK)
322 /*
323 * Note: We overload snd_last to function also as the
324 * snd_last variable described in RFC 2582
325 */
326 tp->snd_last = tp->snd_max;
327 #endif /* TCP_SACK */
328 /*
329 * If timing a segment in this window, stop the timer.
330 */
331 tp->t_rtttime = 0;
332 #ifdef TCP_ECN
333 /*
334 * if ECN is enabled, there might be a broken firewall which
335 * blocks ecn packets. fall back to non-ecn.
336 */
337 if ((tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)
338 && tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
339 tp->t_flags |= TF_DISABLE_ECN;
340 #endif
341 /*
342 * Close the congestion window down to one segment
343 * (we'll open it by one segment for each ack we get).
344 * Since we probably have a window's worth of unacked
345 * data accumulated, this "slow start" keeps us from
346 * dumping all that data as back-to-back packets (which
347 * might overwhelm an intermediate gateway).
348 *
349 * There are two phases to the opening: Initially we
350 * open by one mss on each ack. This makes the window
351 * size increase exponentially with time. If the
352 * window is larger than the path can handle, this
353 * exponential growth results in dropped packet(s)
354 * almost immediately. To get more time between
355 * drops but still "push" the network to take advantage
356 * of improving conditions, we switch from exponential
357 * to linear window opening at some threshhold size.
358 * For a threshhold, we use half the current window
359 * size, truncated to a multiple of the mss.
360 *
361 * (the minimum cwnd that will give us exponential
362 * growth is 2 mss. We don't allow the threshhold
363 * to go below this.)
364 */
365 {
366 u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
367 if (win < 2)
368 win = 2;
369 tp->snd_cwnd = tp->t_maxseg;
370 tp->snd_ssthresh = win * tp->t_maxseg;
371 tp->t_dupacks = 0;
372 #ifdef TCP_ECN
373 tp->snd_last = tp->snd_max;
374 tp->t_flags |= TF_SEND_CWR;
375 #endif
376 #if 1 /* TCP_ECN */
377 tcpstat.tcps_cwr_timeout++;
378 #endif
379 }
380 (void) tcp_output(tp);
381
382 out:
383 splx(s);
384 }
385
386 void
tcp_timer_persist(void * arg)387 tcp_timer_persist(void *arg)
388 {
389 struct tcpcb *tp = arg;
390 uint32_t rto;
391 int s;
392
393 s = splsoftnet();
394 if ((tp->t_flags & TF_DEAD) ||
395 TCP_TIMER_ISARMED(tp, TCPT_REXMT)) {
396 splx(s);
397 return;
398 }
399 tcpstat.tcps_persisttimeo++;
400 /*
401 * Hack: if the peer is dead/unreachable, we do not
402 * time out if the window is closed. After a full
403 * backoff, drop the connection if the idle time
404 * (no responses to probes) reaches the maximum
405 * backoff that we would use if retransmitting.
406 */
407 rto = TCP_REXMTVAL(tp);
408 if (rto < tp->t_rttmin)
409 rto = tp->t_rttmin;
410 if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
411 ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle ||
412 (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
413 tcpstat.tcps_persistdrop++;
414 tp = tcp_drop(tp, ETIMEDOUT);
415 goto out;
416 }
417 tcp_setpersist(tp);
418 tp->t_force = 1;
419 (void) tcp_output(tp);
420 tp->t_force = 0;
421 out:
422 splx(s);
423 }
424
425 void
tcp_timer_keep(void * arg)426 tcp_timer_keep(void *arg)
427 {
428 struct tcpcb *tp = arg;
429 int s;
430
431 s = splsoftnet();
432 if (tp->t_flags & TF_DEAD) {
433 splx(s);
434 return;
435 }
436
437 tcpstat.tcps_keeptimeo++;
438 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
439 goto dropit;
440 if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE &&
441 tp->t_state <= TCPS_CLOSING) {
442 if ((tcp_maxidle > 0) &&
443 ((tcp_now - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle))
444 goto dropit;
445 /*
446 * Send a packet designed to force a response
447 * if the peer is up and reachable:
448 * either an ACK if the connection is still alive,
449 * or an RST if the peer has closed the connection
450 * due to timeout or reboot.
451 * Using sequence number tp->snd_una-1
452 * causes the transmitted zero-length segment
453 * to lie outside the receive window;
454 * by the protocol spec, this requires the
455 * correspondent TCP to respond.
456 */
457 tcpstat.tcps_keepprobe++;
458 #ifdef TCP_COMPAT_42
459 /*
460 * The keepalive packet must have nonzero length
461 * to get a 4.2 host to respond.
462 */
463 tcp_respond(tp, mtod(tp->t_template, caddr_t),
464 NULL, tp->rcv_nxt - 1, tp->snd_una - 1, 0);
465 #else
466 tcp_respond(tp, mtod(tp->t_template, caddr_t),
467 NULL, tp->rcv_nxt, tp->snd_una - 1, 0);
468 #endif
469 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
470 } else
471 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
472
473 splx(s);
474 return;
475
476 dropit:
477 tcpstat.tcps_keepdrops++;
478 tp = tcp_drop(tp, ETIMEDOUT);
479
480 splx(s);
481 }
482
483 void
tcp_timer_2msl(void * arg)484 tcp_timer_2msl(void *arg)
485 {
486 struct tcpcb *tp = arg;
487 int s;
488
489 s = splsoftnet();
490 if (tp->t_flags & TF_DEAD) {
491 splx(s);
492 return;
493 }
494
495 #ifdef TCP_SACK
496 tcp_timer_freesack(tp);
497 #endif
498
499 if (tp->t_state != TCPS_TIME_WAIT &&
500 ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle)))
501 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
502 else
503 tp = tcp_close(tp);
504
505 splx(s);
506 }
507