xref: /dragonfly/sys/netinet/tcp_input.c (revision 2ba12c9ccce24e5186e60a8be059faf55cd27e21)
1 /*
2  * Copyright (c) 2002, 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2002, 2003, 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
36  *        The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *        @(#)tcp_input.c     8.12 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
64  */
65 
66 #include "opt_inet.h"
67 #include "opt_inet6.h"
68 #include "opt_tcpdebug.h"
69 #include "opt_tcp_input.h"
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/sysctl.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/proc.h>                   /* for proc0 declaration */
78 #include <sys/protosw.h>
79 #include <sys/socket.h>
80 #include <sys/socketvar.h>
81 #include <sys/syslog.h>
82 #include <sys/in_cksum.h>
83 
84 #include <sys/socketvar2.h>
85 
86 #include <machine/cpu.h>      /* before tcp_seq.h, for tcp_random18() */
87 #include <machine/stdarg.h>
88 
89 #include <net/if.h>
90 #include <net/route.h>
91 
92 #include <netinet/in.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/ip.h>
95 #include <netinet/ip_icmp.h>  /* for ICMP_BANDLIM */
96 #include <netinet/in_var.h>
97 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
98 #include <netinet/in_pcb.h>
99 #include <netinet/ip_var.h>
100 #include <netinet/ip6.h>
101 #include <netinet/icmp6.h>
102 #include <netinet6/nd6.h>
103 #include <netinet6/ip6_var.h>
104 #include <netinet6/in6_pcb.h>
105 #include <netinet/tcp.h>
106 #include <netinet/tcp_fsm.h>
107 #include <netinet/tcp_seq.h>
108 #include <netinet/tcp_timer.h>
109 #include <netinet/tcp_timer2.h>
110 #include <netinet/tcp_var.h>
111 #include <netinet6/tcp6_var.h>
112 #include <netinet/tcpip.h>
113 
114 #ifdef TCPDEBUG
115 #include <netinet/tcp_debug.h>
116 
117 u_char tcp_saveipgen[40];    /* the size must be of max ip header, now IPv6 */
118 struct tcphdr tcp_savetcp;
119 #endif
120 
121 /*
122  * Limit burst of new packets during SACK based fast recovery
123  * or extended limited transmit.
124  */
125 #define TCP_SACK_MAXBURST     4
126 
127 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
128 
129 static int log_in_vain = 0;
130 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
131     &log_in_vain, 0, "Log all incoming TCP connections");
132 
133 static int blackhole = 0;
134 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
135     &blackhole, 0, "Do not send RST when dropping refused connections");
136 
137 int tcp_delack_enabled = 1;
138 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
139     &tcp_delack_enabled, 0,
140     "Delay ACK to try to piggyback it onto a data packet");
141 
142 #ifdef TCP_DROP_SYNFIN
143 static int drop_synfin = 0;
144 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
145     &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
146 #endif
147 
148 static int tcp_do_limitedtransmit = 1;
149 SYSCTL_INT(_net_inet_tcp, OID_AUTO, limitedtransmit, CTLFLAG_RW,
150     &tcp_do_limitedtransmit, 0, "Enable RFC 3042 (Limited Transmit)");
151 
152 static int tcp_do_early_retransmit = 1;
153 SYSCTL_INT(_net_inet_tcp, OID_AUTO, earlyretransmit, CTLFLAG_RW,
154     &tcp_do_early_retransmit, 0, "Early retransmit");
155 
156 int tcp_aggregate_acks = 1;
157 SYSCTL_INT(_net_inet_tcp, OID_AUTO, aggregate_acks, CTLFLAG_RW,
158     &tcp_aggregate_acks, 0, "Aggregate built-up acks into one ack");
159 
160 static int tcp_do_eifel_detect = 1;
161 SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW,
162     &tcp_do_eifel_detect, 0, "Eifel detection algorithm (RFC 3522)");
163 
164 static int tcp_do_abc = 1;
165 SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc, CTLFLAG_RW,
166     &tcp_do_abc, 0,
167     "TCP Appropriate Byte Counting (RFC 3465)");
168 
169 /*
170  * The following value actually takes range [25ms, 250ms],
171  * given that most modern systems use 1ms ~ 10ms as the unit
172  * of timestamp option.
173  */
174 static u_int tcp_paws_tolerance = 25;
175 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW,
176     &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance");
177 
178 /*
179  * Define as tunable for easy testing with SACK on and off.
180  * Warning:  do not change setting in the middle of an existing active TCP flow,
181  *   else strange things might happen to that flow.
182  */
183 int tcp_do_sack = 1;
184 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW,
185     &tcp_do_sack, 0, "Enable SACK Algorithms");
186 
187 int tcp_do_smartsack = 1;
188 SYSCTL_INT(_net_inet_tcp, OID_AUTO, smartsack, CTLFLAG_RW,
189     &tcp_do_smartsack, 0, "Enable Smart SACK Algorithms");
190 
191 int tcp_do_rescuesack = 1;
192 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
193     &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
194 
195 int tcp_aggressive_rescuesack = 0;
196 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
197     &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
198 
199 static int tcp_force_sackrxt = 1;
200 SYSCTL_INT(_net_inet_tcp, OID_AUTO, force_sackrxt, CTLFLAG_RW,
201     &tcp_force_sackrxt, 0, "Allowed forced SACK retransmit burst");
202 
203 int tcp_do_rfc6675 = 1;
204 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675, CTLFLAG_RW,
205     &tcp_do_rfc6675, 0, "Enable RFC6675");
206 
207 int tcp_rfc6675_rxt = 0;
208 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_rxt, CTLFLAG_RW,
209     &tcp_rfc6675_rxt, 0, "Enable RFC6675 retransmit");
210 
/*
 * net.inet.tcp.reass: global accounting for the TCP segment reassembly
 * queue.  All three values below are exported read-only (CTLFLAG_RD).
 */
211 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
212     "TCP Segment Reassembly Queue");
213 
/*
 * Upper bound checked by tcp_reass() before it queues a segment that is
 * not the next expected one.  Initialized to 0 here.
 * NOTE(review): presumably given its real value by init code outside this
 * part of the file -- confirm.
 */
214 int tcp_reass_maxseg = 0;
215 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RD,
216     &tcp_reass_maxseg, 0,
217     "Global maximum number of TCP Segments in Reassembly Queue");
218 
/*
 * Number of queue entries currently allocated.  tcp_reass() adjusts it
 * with atomic_add_int() whenever it allocates or frees an entry.
 */
219 int tcp_reass_qsize = 0;
220 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
221     &tcp_reass_qsize, 0,
222     "Global number of TCP Segments currently in Reassembly Queue");
223 
/*
 * Count of segments dropped by tcp_reass() because the global limit was
 * reached.  Unlike tcp_reass_qsize it is bumped with a plain "++".
 */
224 static int tcp_reass_overflows = 0;
225 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
226     &tcp_reass_overflows, 0,
227     "Global number of TCP Segment Reassembly Queue Overflows");
228 
229 int tcp_do_autorcvbuf = 1;
230 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
231     &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
232 
233 int tcp_autorcvbuf_inc = 16*1024;
234 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
235     &tcp_autorcvbuf_inc, 0,
236     "Incrementor step size of automatic receive buffer");
237 
238 int tcp_autorcvbuf_max = 2*1024*1024;
239 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
240     &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
241 
242 int tcp_sosend_agglim = 2;
243 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_agglim, CTLFLAG_RW,
244     &tcp_sosend_agglim, 0, "TCP sosend mbuf aggregation limit");
245 
246 int tcp_sosend_async = 1;
247 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_async, CTLFLAG_RW,
248     &tcp_sosend_async, 0, "TCP asynchronized pru_send");
249 
250 int tcp_sosend_jcluster = 1;
251 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_jcluster, CTLFLAG_RW,
252     &tcp_sosend_jcluster, 0, "TCP output uses jcluster");
253 
254 static int tcp_ignore_redun_dsack = 1;
255 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_redun_dsack, CTLFLAG_RW,
256     &tcp_ignore_redun_dsack, 0, "Ignore redundant DSACK");
257 
258 static int tcp_reuseport_ext = 1;
259 SYSCTL_INT(_net_inet_tcp, OID_AUTO, reuseport_ext, CTLFLAG_RW,
260     &tcp_reuseport_ext, 0, "SO_REUSEPORT extension");
261 
262 static void          tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t,
263                         tcp_seq);
264 static void          tcp_pulloutofband(struct socket *,
265                          struct tcphdr *, struct mbuf *, int);
266 static int           tcp_reass(struct tcpcb *, struct tcphdr *, int *,
267                          struct mbuf *);
268 static void          tcp_xmit_timer(struct tcpcb *, int, tcp_seq);
269 static void          tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int);
270 static void          tcp_sack_rexmt(struct tcpcb *, boolean_t);
271 static boolean_t tcp_sack_limitedxmit(struct tcpcb *);
272 static int           tcp_rmx_msl(const struct tcpcb *);
273 static void          tcp_established(struct tcpcb *);
274 static boolean_t tcp_recv_dupack(struct tcpcb *, tcp_seq, u_int);
275 
276 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
277 #ifdef INET6
278 #define ND6_HINT(tp) \
279 do { \
280           if ((tp) && (tp)->t_inpcb && \
281               INP_ISIPV6((tp)->t_inpcb) && \
282               (tp)->t_inpcb->in6p_route.ro_rt) \
283                     nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
284 } while (0)
285 #else
286 #define ND6_HINT(tp)
287 #endif
288 
/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- delayed acks are enabled and
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *
 * NOTE: like the other macros below, every parameter is parenthesized
 * so that arbitrary expressions may be passed, but parameters are still
 * evaluated more than once; do not pass expressions with side effects.
 */
#define DELAY_ACK(tp) \
	(tcp_delack_enabled && \
	 !tcp_callout_pending((tp), (tp)->tt_delack) && \
	 !((tp)->t_flags & TF_RXWIN0SENT))

/*
 * TRUE if the segment described by th may update the send window of tp:
 *	- its sequence number is newer than snd_wl1, or
 *	- the sequence number equals snd_wl1 and its ack is newer than
 *	  snd_wl2, or
 *	- both equal the recorded values and the advertised window (tiwin)
 *	  is larger than the current snd_wnd.
 */
#define acceptable_window_update(tp, th, tiwin)				\
    (SEQ_LT((tp)->snd_wl1, (th)->th_seq) ||				\
     ((tp)->snd_wl1 == (th)->th_seq &&					\
      (SEQ_LT((tp)->snd_wl2, (th)->th_ack) ||				\
       ((tp)->snd_wl2 == (th)->th_ack && (tiwin) > (tp)->snd_wnd))))

/* Integer division of n by d, rounded up (for non-negative n, positive d). */
#define	iceildiv(n, d)		(((n)+(d)-1) / (d))

/*
 * TRUE if early retransmit should be attempted for tp with ownd bytes
 * outstanding.  All of the following must hold:
 *	- early retransmit is enabled,
 *	- Eifel detection is enabled and TF_RCVD_TSTMP is set on tp,
 *	- ownd is below (t_rxtthresh + 1) segments,
 *	- t_dupacks + 1 is at least the number of segments covered by ownd,
 *	- SACK is not in use, or ownd fits in one segment, or
 *	  tcp_sack_has_sacked() reports ownd - t_maxseg as SACKed.
 */
#define need_early_retransmit(tp, ownd) \
    (tcp_do_early_retransmit && \
     (tcp_do_eifel_detect && ((tp)->t_flags & TF_RCVD_TSTMP)) && \
     (ownd) < (((tp)->t_rxtthresh + 1) * (tp)->t_maxseg) && \
     (tp)->t_dupacks + 1 >= iceildiv((ownd), (tp)->t_maxseg) && \
     (!TCP_DO_SACK(tp) || (ownd) <= (tp)->t_maxseg || \
      tcp_sack_has_sacked(&(tp)->scb, (ownd) - (tp)->t_maxseg)))
314 
315 /*
316  * Returns TRUE, if this segment can be merged with the last
317  * pending segment in the reassemble queue and this segment
318  * does not overlap with the pending segment immediately
319  * preceeding the last pending segment.
320  */
321 static __inline boolean_t
tcp_paws_canreasslast(const struct tcpcb * tp,const struct tcphdr * th,int tlen)322 tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen)
323 {
324           const struct tseg_qent *last, *prev;
325 
326           last = TAILQ_LAST(&tp->t_segq, tsegqe_head);
327           if (last == NULL)
328                     return FALSE;
329 
330           /* This segment comes immediately after the last pending segment */
331           if (last->tqe_th->th_seq + last->tqe_len == th->th_seq) {
332                     if (last->tqe_th->th_flags & TH_FIN) {
333                               /* No segments should follow segment w/ FIN */
334                               return FALSE;
335                     }
336                     return TRUE;
337           }
338 
339           if (th->th_seq + tlen != last->tqe_th->th_seq)
340                     return FALSE;
341           /* This segment comes immediately before the last pending segment */
342 
343           prev = TAILQ_PREV(last, tsegqe_head, tqe_q);
344           if (prev == NULL) {
345                     /*
346                      * No pending preceeding segment, we assume this segment
347                      * could be reassembled.
348                      */
349                     return TRUE;
350           }
351 
352           /* This segment does not overlap with the preceeding segment */
353           if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len))
354                     return TRUE;
355 
356           return FALSE;
357 }
358 
359 static __inline void
tcp_ncr_update_rxtthresh(struct tcpcb * tp)360 tcp_ncr_update_rxtthresh(struct tcpcb *tp)
361 {
362           int old_rxtthresh = tp->t_rxtthresh;
363           uint32_t ownd = tp->snd_max - tp->snd_una;
364 
365           tp->t_rxtthresh = min(tcp_ncr_rxtthresh_max,
366               max(tcprexmtthresh, ((ownd / tp->t_maxseg) >> 1)));
367           if (tp->t_rxtthresh != old_rxtthresh) {
368                     tcp_sack_update_lostseq(&tp->scb, tp->snd_una,
369                         tp->t_maxseg, tp->t_rxtthresh);
370           }
371 }
372 
/*
 * Insert a segment into tp's reassembly queue (tp->t_segq) and, once the
 * connection has reached ESTABLISHED, pass in-sequence data at the head
 * of the queue to the socket receive buffer.
 *
 *	tp	connection whose queue is updated
 *	th	TCP header of the incoming segment; NULL means "queue
 *		nothing, only try to present what is already queued"
 *	tlenp	in/out length of the segment; reduced when the front of
 *		the segment duplicates data of the preceding queue entry
 *	m	mbuf chain of the segment; it is consumed here (queued,
 *		concatenated onto another entry, or freed)
 *
 * Besides the queue itself this maintains tp->reportblk, tp->encloseblk
 * and the TSACK_F_DUPSEG / TSACK_F_ENCLOSESEG bits of tp->sack_flags, as
 * well as the global tcp_reass_qsize counter.
 *
 * Returns the TH_FIN bit of the queue entry handed to the socket, or 0
 * when nothing was handed over.
 */
373 static int
tcp_reass(struct tcpcb * tp,struct tcphdr * th,int * tlenp,struct mbuf * m)374 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
375 {
376           struct tseg_qent *q;
377           struct tseg_qent *p = NULL;
378           struct tseg_qent *te;
379           struct socket *so = tp->t_inpcb->inp_socket;
380           int flags;
381 
382           /*
383            * Call with th == NULL after become established to
384            * force pre-ESTABLISHED data up to user socket.
385            */
386           if (th == NULL)
387                     goto present;
388 
389           /*
390            * Limit the number of segments in the reassembly queue to prevent
391            * holding on to too many segments (and thus running out of mbufs).
392            * Make sure to let the missing segment through which caused this
393            * queue.  Always keep one global queue entry spare to be able to
394            * process the missing segment.
395            */
396           if (th->th_seq != tp->rcv_nxt &&
397               tcp_reass_qsize + 1 >= tcp_reass_maxseg) {
398                     tcp_reass_overflows++;
399                     tcpstat.tcps_rcvmemdrop++;
400                     m_freem(m);
401                     /* no SACK block to report */
402                     tp->reportblk.rblk_start = tp->reportblk.rblk_end;
403                     return (0);
404           }
405 
406           /* Allocate a new queue entry. */
407           te = kmalloc(sizeof(struct tseg_qent), M_TSEGQ, M_INTWAIT | M_NULLOK);
408           if (te == NULL) {
409                     tcpstat.tcps_rcvmemdrop++;
410                     m_freem(m);
411                     /* no SACK block to report */
412                     tp->reportblk.rblk_start = tp->reportblk.rblk_end;
413                     return (0);
414           }
              /*
               * Account for the new entry.  Every path below that frees a
               * queue entry decrements the counter again.
               */
415           atomic_add_int(&tcp_reass_qsize, 1);
416 
              /*
               * Record that a FIN has been seen.  Note this happens before
               * the duplicate checks, so the flag is set even if the segment
               * itself ends up being dropped as fully duplicated.
               */
417           if (th->th_flags & TH_FIN)
418                     tp->t_flags |= TF_QUEDFIN;
419 
420           /*
421            * Find a segment which begins after this one does.
               * On loop exit q is that segment (NULL if there is none) and
               * p is the entry just before it (NULL if q is the queue head).
422            */
423           TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
424                     if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
425                               break;
426                     p = q;
427           }
428 
429           /*
430            * If there is a preceding segment, it may provide some of
431            * our data already.  If so, drop the data from the incoming
432            * segment.  If it provides all of our data, drop us.
433            */
434           if (p != NULL) {
435                     tcp_seq_diff_t i;
436 
437                     /* conversion to int (in i) handles seq wraparound */
438                     i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
439                     if (i > 0) {                  /* overlaps preceding segment */
440                               tp->sack_flags |=
441                                   (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
442                               /* enclosing block starts w/ preceding segment */
443                               tp->encloseblk.rblk_start = p->tqe_th->th_seq;
444                               if (i >= *tlenp) {
                                        /*
                                         * The incoming data is dropped, but a FIN
                                         * it carries is kept on the preceding entry.
                                         */
445                                         if (th->th_flags & TH_FIN)
446                                                   p->tqe_th->th_flags |= TH_FIN;
447 
448                                         /* preceding encloses incoming segment */
449                                         tp->encloseblk.rblk_end = TCP_SACK_BLKEND(
450                                             p->tqe_th->th_seq + p->tqe_len,
451                                             p->tqe_th->th_flags);
452                                         tcpstat.tcps_rcvduppack++;
453                                         tcpstat.tcps_rcvdupbyte += *tlenp;
454                                         m_freem(m);
455                                         kfree(te, M_TSEGQ);
456                                         atomic_add_int(&tcp_reass_qsize, -1);
457                                         /*
458                                          * Try to present any queued data
459                                          * at the left window edge to the user.
460                                          * This is needed after the 3-WHS
461                                          * completes.
462                                          */
463                                         goto present;       /* ??? */
464                               }
                              /*
                               * Partial overlap: cut the duplicated front off
                               * the incoming segment and advance its start.
                               */
465                               m_adj(m, i);
466                               *tlenp -= i;
467                               th->th_seq += i;
468                               /* incoming segment end is enclosing block end */
469                               tp->encloseblk.rblk_end = TCP_SACK_BLKEND(
470                                   th->th_seq + *tlenp, th->th_flags);
471                               /* trim end of reported D-SACK block */
472                               tp->reportblk.rblk_end = th->th_seq;
473                     }
474           }
475           tcpstat.tcps_rcvoopack++;
476           tcpstat.tcps_rcvoobyte += *tlenp;
477 
478           /*
479            * While we overlap succeeding segments trim them or,
480            * if they are completely covered, dequeue them.
481            */
482           while (q) {
                        /* i: number of bytes by which we run into q (<= 0: none) */
483                     tcp_seq_diff_t i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
484                     tcp_seq qend = q->tqe_th->th_seq + q->tqe_len;
485                     tcp_seq qend_sack = TCP_SACK_BLKEND(qend, q->tqe_th->th_flags);
486                     struct tseg_qent *nq;
487 
488                     if (i <= 0)
489                               break;
490                     if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
491                               /* first time through */
492                               tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
493                               tp->encloseblk = tp->reportblk;
494                               /* report trailing duplicate D-SACK segment */
495                               tp->reportblk.rblk_start = q->tqe_th->th_seq;
496                     }
497                     if ((tp->sack_flags & TSACK_F_ENCLOSESEG) &&
498                         SEQ_GT(qend_sack, tp->encloseblk.rblk_end)) {
499                               /* extend enclosing block if one exists */
500                               tp->encloseblk.rblk_end = qend_sack;
501                     }
502                     if (i < q->tqe_len) {
                              /*
                               * q is only partly covered: trim its front.
                               * Afterwards q starts exactly where the incoming
                               * segment ends, so the coalescing check below
                               * will match it.
                               */
503                               q->tqe_th->th_seq += i;
504                               q->tqe_len -= i;
505                               m_adj(q->tqe_m, i);
506                               break;
507                     }
508 
                        /* q is completely covered: keep its FIN, then drop it */
509                     if (q->tqe_th->th_flags & TH_FIN)
510                               th->th_flags |= TH_FIN;
511 
512                     nq = TAILQ_NEXT(q, tqe_q);
513                     TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
514                     m_freem(q->tqe_m);
515                     kfree(q, M_TSEGQ);
516                     atomic_add_int(&tcp_reass_qsize, -1);
517                     q = nq;
518           }
519 
520           /* Insert the new segment queue entry into place. */
521           te->tqe_m = m;
522           te->tqe_th = th;
523           te->tqe_len = *tlenp;
524 
525           /* check if can coalesce with following segment */
526           if (q != NULL && (th->th_seq + *tlenp == q->tqe_th->th_seq)) {
527                     tcp_seq tend_sack;
528 
529                     te->tqe_len += q->tqe_len;
530                     if (q->tqe_th->th_flags & TH_FIN)
531                               te->tqe_th->th_flags |= TH_FIN;
532                     tend_sack = TCP_SACK_BLKEND(te->tqe_th->th_seq + te->tqe_len,
533                         te->tqe_th->th_flags);
534 
535                     m_cat(te->tqe_m, q->tqe_m);
536                     tp->encloseblk.rblk_end = tend_sack;
537                     /*
538                      * When not reporting a duplicate segment, use
539                      * the larger enclosing block as the SACK block.
540                      */
541                     if (!(tp->sack_flags & TSACK_F_DUPSEG))
542                               tp->reportblk.rblk_end = tend_sack;
                        /*
                         * q's mbufs now belong to te's chain, so only the
                         * queue entry itself is released here.
                         */
543                     TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
544                     kfree(q, M_TSEGQ);
545                     atomic_add_int(&tcp_reass_qsize, -1);
546           }
547 
548           if (p == NULL) {
549                     TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q);
550           } else {
551                     /* check if can coalesce with preceding segment */
552                     if (p->tqe_th->th_seq + p->tqe_len == th->th_seq) {
553                               if (te->tqe_th->th_flags & TH_FIN)
554                                         p->tqe_th->th_flags |= TH_FIN;
555                               p->tqe_len += te->tqe_len;
556                               m_cat(p->tqe_m, te->tqe_m);
557                               tp->encloseblk.rblk_start = p->tqe_th->th_seq;
558                               /*
559                                * When not reporting a duplicate segment, use
560                                * the larger enclosing block as the SACK block.
561                                */
562                               if (!(tp->sack_flags & TSACK_F_DUPSEG))
563                                         tp->reportblk.rblk_start = p->tqe_th->th_seq;
                              /*
                               * te was never linked into the queue and its
                               * mbufs now hang off p; free just the entry.
                               */
564                               kfree(te, M_TSEGQ);
565                               atomic_add_int(&tcp_reass_qsize, -1);
566                     } else {
567                               TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q);
568                     }
569           }
570 
571 present:
572           /*
573            * Present data to user, advancing rcv_nxt through
574            * completed sequence space.
               * Nothing is presented before the connection is established
               * or while the head of the queue is not the next expected
               * sequence number.
575            */
576           if (!TCPS_HAVEESTABLISHED(tp->t_state))
577                     return (0);
578           q = TAILQ_FIRST(&tp->t_segq);
579           if (q == NULL || q->tqe_th->th_seq != tp->rcv_nxt)
580                     return (0);
581           tp->rcv_nxt += q->tqe_len;
582           if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
583                     /* no SACK block to report since ACK advanced */
584                     tp->reportblk.rblk_start = tp->reportblk.rblk_end;
585           }
586           /* no enclosing block to report since ACK advanced */
587           tp->sack_flags &= ~TSACK_F_ENCLOSESEG;
588           flags = q->tqe_th->th_flags & TH_FIN;
589           TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
              /*
               * Only one entry is delivered per call: adjacent entries are
               * merged at insertion time, so whatever is now at the head
               * must not be contiguous with the data just taken.
               */
590           KASSERT(TAILQ_EMPTY(&tp->t_segq) ||
591                     TAILQ_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
592                     ("segment not coalesced"));
              /* Discard the data if the socket cannot receive any more. */
593           if (so->so_state & SS_CANTRCVMORE) {
594                     m_freem(q->tqe_m);
595           } else {
596                     lwkt_gettoken(&so->so_rcv.ssb_token);
597                     ssb_appendstream(&so->so_rcv, q->tqe_m);
598                     lwkt_reltoken(&so->so_rcv.ssb_token);
599           }
600           kfree(q, M_TSEGQ);
601           atomic_add_int(&tcp_reass_qsize, -1);
602           ND6_HINT(tp);
603           sorwakeup(so);
604           return (flags);
605 }
606 
607 /*
608  * TCP input routine, follows pages 65-76 of the
609  * protocol specification dated September, 1981 very closely.
610  */
611 #ifdef INET6
612 int
tcp6_input(struct mbuf ** mp,int * offp,int proto)613 tcp6_input(struct mbuf **mp, int *offp, int proto)
614 {
615           struct mbuf *m = *mp;
616           struct in6_ifaddr *ia6;
617 
618           IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
619 
620           /*
621            * draft-itojun-ipv6-tcp-to-anycast
622            * better place to put this in?
623            */
624           ia6 = ip6_getdstifaddr(m);
625           if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
626                     icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
627                                   offsetof(struct ip6_hdr, ip6_dst));
628                     return (IPPROTO_DONE);
629           }
630 
631           tcp_input(mp, offp, proto);
632           return (IPPROTO_DONE);
633 }
634 #endif
635 
636 int
tcp_input(struct mbuf ** mp,int * offp,int proto)637 tcp_input(struct mbuf **mp, int *offp, int proto)
638 {
639           int off0;
640           struct tcphdr *th;
641           struct ip *ip = NULL;
642           struct ipovly *ipov;
643           struct inpcb *inp = NULL;
644           u_char *optp = NULL;
645           int optlen = 0;
646           int tlen, off;
647           int len = 0;
648           int drop_hdrlen;
649           struct tcpcb *tp = NULL;
650           int thflags;
651           struct socket *so = NULL;
652           int todrop, acked;
653           boolean_t ourfinisacked, needoutput = FALSE, delayed_dupack = FALSE;
654           tcp_seq th_dupack = 0; /* XXX gcc warning */
655           u_int to_flags = 0; /* XXX gcc warning */
656           u_long tiwin;
657           int recvwin;
658           struct tcpopt to;             /* options in this segment */
659           struct sockaddr_in *next_hop = NULL;
660           int rstreason; /* For badport_bandlim accounting purposes */
661           int cpu;
662           struct ip6_hdr *ip6 = NULL;
663           struct mbuf *m;
664 #ifdef INET6
665           boolean_t isipv6;
666 #else
667           const boolean_t isipv6 = FALSE;
668 #endif
669 #ifdef TCPDEBUG
670           short ostate = 0;
671 #endif
672 
673           off0 = *offp;
674           m = *mp;
675           *mp = NULL;
676 
677           tcpstat.tcps_rcvtotal++;
678 
679           if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) {
680                     struct m_tag *mtag;
681 
682                     mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
683                     KKASSERT(mtag != NULL);
684                     next_hop = m_tag_data(mtag);
685           }
686 
687 #ifdef INET6
688           isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? TRUE : FALSE;
689 #endif
690 
691           if (isipv6) {
692                     /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
693                     ip6 = mtod(m, struct ip6_hdr *);
694                     tlen = (sizeof *ip6) + ntohs(ip6->ip6_plen) - off0;
695                     if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
696                               tcpstat.tcps_rcvbadsum++;
697                               goto drop;
698                     }
699                     th = (struct tcphdr *)((caddr_t)ip6 + off0);
700 
701                     /*
702                      * Be proactive about unspecified IPv6 address in source.
703                      * As we use all-zero to indicate unbounded/unconnected pcb,
704                      * unspecified IPv6 address can be used to confuse us.
705                      *
706                      * Note that packets with unspecified IPv6 destination is
707                      * already dropped in ip6_input.
708                      */
709                     if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
710                               /* XXX stat */
711                               goto drop;
712                     }
713           } else {
714                     /*
715                      * Get IP and TCP header together in first mbuf.
716                      * Note: IP leaves IP header in first mbuf.
717                      */
718                     if (off0 > sizeof(struct ip)) {
719                               ip_stripoptions(m);
720                               off0 = sizeof(struct ip);
721                     }
722                     /* already checked and pulled up in ip_demux() */
723                     KASSERT(m->m_len >= sizeof(struct tcpiphdr),
724                         ("TCP header not in one mbuf: m->m_len %d", m->m_len));
725                     ip = mtod(m, struct ip *);
726                     ipov = (struct ipovly *)ip;
727                     th = (struct tcphdr *)((caddr_t)ip + off0);
728                     tlen = ntohs(ip->ip_len) - off0;
729 
730                     if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
731                               if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
732                                         th->th_sum = m->m_pkthdr.csum_data;
733                               else
734                                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
735                                                             ip->ip_dst.s_addr,
736                                                             htonl(m->m_pkthdr.csum_data +
737                                                                       ntohs(ip->ip_len) +
738                                                                       IPPROTO_TCP));
739                               th->th_sum ^= 0xffff;
740                     } else {
741                               /*
742                                * Checksum extended TCP header and data.
743                                */
744                               len = sizeof(struct ip) + tlen;
745                               bzero(ipov->ih_x1, sizeof ipov->ih_x1);
746                               ipov->ih_len = (u_short)tlen;
747                               ipov->ih_len = htons(ipov->ih_len);
748                               th->th_sum = in_cksum(m, len);
749                     }
750                     if (th->th_sum) {
751                               tcpstat.tcps_rcvbadsum++;
752                               goto drop;
753                     }
754 #ifdef INET6
755                     /* Re-initialization for later version check */
756                     ip->ip_v = IPVERSION;
757 #endif
758           }
759 
760           /*
761            * Check that TCP offset makes sense,
762            * pull out TCP options and adjust length.                  XXX
763            */
764           off = th->th_off << 2;
765           /* already checked and pulled up in ip_demux() */
766           KASSERT(off >= sizeof(struct tcphdr) && off <= tlen,
767               ("bad TCP data offset %d (tlen %d)", off, tlen));
768           tlen -= off;        /* tlen is used instead of ti->ti_len */
769           if (off > sizeof(struct tcphdr)) {
770                     if (isipv6) {
771                               IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE);
772                               ip6 = mtod(m, struct ip6_hdr *);
773                               th = (struct tcphdr *)((caddr_t)ip6 + off0);
774                     } else {
775                               /* already pulled up in ip_demux() */
776                               KASSERT(m->m_len >= sizeof(struct ip) + off,
777                                   ("TCP header and options not in one mbuf: "
778                                    "m_len %d, off %d", m->m_len, off));
779                     }
780                     optlen = off - sizeof(struct tcphdr);
781                     optp = (u_char *)(th + 1);
782           }
783           thflags = th->th_flags;
784 
785 #ifdef TCP_DROP_SYNFIN
786           /*
787            * If the drop_synfin option is enabled, drop all packets with
788            * both the SYN and FIN bits set. This prevents e.g. nmap from
789            * identifying the TCP/IP stack.
790            *
791            * This is a violation of the TCP specification.
792            */
793           if (drop_synfin && (thflags & (TH_SYN | TH_FIN)) == (TH_SYN | TH_FIN))
794                     goto drop;
795 #endif
796 
797           /*
798            * Convert TCP protocol specific fields to host format.
799            */
800           th->th_seq = ntohl(th->th_seq);
801           th->th_ack = ntohl(th->th_ack);
802           th->th_win = ntohs(th->th_win);
803           th->th_urp = ntohs(th->th_urp);
804 
805           /*
806            * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
807            * until after ip6_savecontrol() is called and before other functions
808            * which don't want those proto headers.
809            * Because ip6_savecontrol() is going to parse the mbuf to
810            * search for data to be passed up to user-land, it wants mbuf
811            * parameters to be unchanged.
812            * XXX: the call of ip6_savecontrol() has been obsoleted based on
813            * latest version of the advanced API (20020110).
814            */
815           drop_hdrlen = off0 + off;
816 
817           /*
818            * Locate pcb for segment.
819            */
820 findpcb:
821           /* IPFIREWALL_FORWARD section */
822           if (next_hop != NULL && !isipv6) {  /* IPv6 support is not there yet */
823                     /*
824                      * Transparently forwarded. Pretend to be the destination.
825                      * already got one like this?
826                      */
827                     cpu = mycpu->gd_cpuid;
828                     inp = in_pcblookup_hash(&tcbinfo[cpu],
829                                                   ip->ip_src, th->th_sport,
830                                                   ip->ip_dst, th->th_dport,
831                                                   0, m->m_pkthdr.rcvif);
832                     if (!inp) {
833                               /*
834                                * It's new.  Try to find the ambushing socket.
835                                */
836 
837                               /*
838                                * The rest of the ipfw code stores the port in
839                                * host order.  XXX
840                                * (The IP address is still in network order.)
841                                */
842                               in_port_t dport = next_hop->sin_port ?
843                                                             htons(next_hop->sin_port) :
844                                                             th->th_dport;
845 
846                               cpu = tcp_addrcpu(ip->ip_src.s_addr, th->th_sport,
847                                                     next_hop->sin_addr.s_addr, dport);
848                               inp = in_pcblookup_hash(&tcbinfo[cpu],
849                                                             ip->ip_src, th->th_sport,
850                                                             next_hop->sin_addr, dport,
851                                                             1, m->m_pkthdr.rcvif);
852                     }
853           } else {
854                     if (isipv6) {
855                               inp = in6_pcblookup_hash(&tcbinfo[0],
856                                                              &ip6->ip6_src, th->th_sport,
857                                                              &ip6->ip6_dst, th->th_dport,
858                                                              1, m->m_pkthdr.rcvif);
859                     } else {
860                               cpu = mycpu->gd_cpuid;
861                               inp = in_pcblookup_pkthash(&tcbinfo[cpu],
862                                                       ip->ip_src, th->th_sport,
863                                                       ip->ip_dst, th->th_dport,
864                                                       1, m->m_pkthdr.rcvif,
865                                                       tcp_reuseport_ext ? m : NULL);
866                     }
867           }
868 
869           /*
870            * If the state is CLOSED (i.e., TCB does not exist) then
871            * all data in the incoming segment is discarded.
872            * If the TCB exists but is in CLOSED state, it is embryonic,
873            * but should either do a listen or a connect soon.
874            */
875           if (inp == NULL) {
876                     if (log_in_vain) {
877 #ifdef INET6
878                               char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2];
879 #else
880                               char dbuf[INET_ADDRSTRLEN], sbuf[INET_ADDRSTRLEN];
881 #endif
882                               if (isipv6) {
883                                         strcpy(dbuf, "[");
884                                         strcat(dbuf, ip6_sprintf(&ip6->ip6_dst));
885                                         strcat(dbuf, "]");
886                                         strcpy(sbuf, "[");
887                                         strcat(sbuf, ip6_sprintf(&ip6->ip6_src));
888                                         strcat(sbuf, "]");
889                               } else {
890                                         kinet_ntoa(ip->ip_dst, dbuf);
891                                         kinet_ntoa(ip->ip_src, sbuf);
892                               }
893                               switch (log_in_vain) {
894                               case 1:
895                                         if (!(thflags & TH_SYN))
896                                                   break;
897                               case 2:
898                                         log(LOG_INFO,
899                                             "Connection attempt to TCP %s:%d "
900                                             "from %s:%d flags:0x%02x\n",
901                                             dbuf, ntohs(th->th_dport), sbuf,
902                                             ntohs(th->th_sport), thflags);
903                                         break;
904                               default:
905                                         break;
906                               }
907                     }
908                     if (blackhole) {
909                               switch (blackhole) {
910                               case 1:
911                                         if (thflags & TH_SYN)
912                                                   goto drop;
913                                         break;
914                               case 2:
915                                         goto drop;
916                               default:
917                                         goto drop;
918                               }
919                     }
920                     rstreason = BANDLIM_RST_CLOSEDPORT;
921                     goto dropwithreset;
922           }
923 
924           /* Check the minimum TTL for socket. */
925 #ifdef INET6
926           if ((isipv6 ? ip6->ip6_hlim : ip->ip_ttl) < inp->inp_ip_minttl)
927                     goto drop;
928 #endif
929 
930           tp = intotcpcb(inp);
931           KASSERT(tp != NULL, ("tcp_input: tp is NULL"));
932           if (tp->t_state <= TCPS_CLOSED)
933                     goto drop;
934 
935           so = inp->inp_socket;
936 
937 #ifdef TCPDEBUG
938           if (so->so_options & SO_DEBUG) {
939                     ostate = tp->t_state;
940                     if (isipv6)
941                               bcopy(ip6, tcp_saveipgen, sizeof(*ip6));
942                     else
943                               bcopy(ip, tcp_saveipgen, sizeof(*ip));
944                     tcp_savetcp = *th;
945           }
946 #endif
947 
948           bzero(&to, sizeof to);
949 
950           if (so->so_options & SO_ACCEPTCONN) {
951                     struct in_conninfo inc;
952 
953 #ifdef INET6
954                     inc.inc_isipv6 = (isipv6 == TRUE);
955 #endif
956                     if (isipv6) {
957                               inc.inc6_faddr = ip6->ip6_src;
958                               inc.inc6_laddr = ip6->ip6_dst;
959                               inc.inc6_route.ro_rt = NULL;            /* XXX */
960                     } else {
961                               inc.inc_faddr = ip->ip_src;
962                               inc.inc_laddr = ip->ip_dst;
963                               inc.inc_route.ro_rt = NULL;             /* XXX */
964                     }
965                     inc.inc_fport = th->th_sport;
966                     inc.inc_lport = th->th_dport;
967 
968                     /*
969                      * If the state is LISTEN then ignore segment if it contains
970                      * a RST.  If the segment contains an ACK then it is bad and
971                      * send a RST.  If it does not contain a SYN then it is not
972                      * interesting; drop it.
973                      *
974                      * If the state is SYN_RECEIVED (syncache) and seg contains
975                      * an ACK, but not for our SYN/ACK, send a RST.  If the seg
976                      * contains a RST, check the sequence number to see if it
977                      * is a valid reset segment.
978                      */
979                     if ((thflags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) {
980                               if ((thflags & (TH_RST | TH_ACK | TH_SYN)) == TH_ACK) {
981                                         if (!syncache_expand(&inc, th, &so, m)) {
982                                                   /*
983                                                    * No syncache entry, or ACK was not
984                                                    * for our SYN/ACK.  Send a RST.
985                                                    */
986                                                   tcpstat.tcps_badsyn++;
987                                                   rstreason = BANDLIM_RST_OPENPORT;
988                                                   goto dropwithreset;
989                                         }
990 
991                                         /*
992                                          * Could not complete 3-way handshake,
993                                          * connection is being closed down, and
994                                          * syncache will free mbuf.
995                                          */
996                                         if (so == NULL)
997                                                   return(IPPROTO_DONE);
998 
999                                         /*
1000                                          * We must be in the correct protocol thread
1001                                          * for this connection.
1002                                          */
1003                                         KKASSERT(so->so_port == &curthread->td_msgport);
1004 
1005                                         /*
1006                                          * Socket is created in state SYN_RECEIVED.
1007                                          * Continue processing segment.
1008                                          */
1009                                         inp = so->so_pcb;
1010                                         tp = intotcpcb(inp);
1011                                         /*
1012                                          * This is what would have happened in
1013                                          * tcp_output() when the SYN,ACK was sent.
1014                                          */
1015                                         tp->snd_up = tp->snd_una;
1016                                         tp->snd_max = tp->snd_nxt = tp->iss + 1;
1017                                         tp->last_ack_sent = tp->rcv_nxt;
1018 
1019                                         goto after_listen;
1020                               }
1021                               if (thflags & TH_RST) {
1022                                         syncache_chkrst(&inc, th);
1023                                         goto drop;
1024                               }
1025                               if (thflags & TH_ACK) {
1026                                         syncache_badack(&inc);
1027                                         tcpstat.tcps_badsyn++;
1028                                         rstreason = BANDLIM_RST_OPENPORT;
1029                                         goto dropwithreset;
1030                               }
1031                               goto drop;
1032                     }
1033 
1034                     /*
1035                      * Segment's flags are (SYN) or (SYN | FIN).
1036                      */
1037 #ifdef INET6
1038                     /*
1039                      * If deprecated address is forbidden,
1040                      * we do not accept SYN to deprecated interface
1041                      * address to prevent any new inbound connection from
1042                      * getting established.
1043                      * When we do not accept SYN, we send a TCP RST,
1044                      * with deprecated source address (instead of dropping
1045                      * it).  We compromise it as it is much better for peer
1046                      * to send a RST, and RST will be the final packet
1047                      * for the exchange.
1048                      *
1049                      * If we do not forbid deprecated addresses, we accept
1050                      * the SYN packet.  RFC2462 does not suggest dropping
1051                      * SYN in this case.
1052                      * If we decipher RFC2462 5.5.4, it says like this:
1053                      * 1. use of deprecated addr with existing
1054                      *    communication is okay - "SHOULD continue to be
1055                      *    used"
1056                      * 2. use of it with new communication:
1057                      *   (2a) "SHOULD NOT be used if alternate address
1058                      *          with sufficient scope is available"
1059                      *   (2b) nothing mentioned otherwise.
1060                      * Here we fall into (2b) case as we have no choice in
1061                      * our source address selection - we must obey the peer.
1062                      *
1063                      * The wording in RFC2462 is confusing, and there are
1064                      * multiple description text for deprecated address
1065                      * handling - worse, they are not exactly the same.
1066                      * I believe 5.5.4 is the best one, so we follow 5.5.4.
1067                      */
1068                     if (isipv6 && !ip6_use_deprecated) {
1069                               struct in6_ifaddr *ia6;
1070 
1071                               if ((ia6 = ip6_getdstifaddr(m)) &&
1072                                   (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1073                                         tp = NULL;
1074                                         rstreason = BANDLIM_RST_OPENPORT;
1075                                         goto dropwithreset;
1076                               }
1077                     }
1078 #endif
1079                     /*
1080                      * If it is from this socket, drop it, it must be forged.
1081                      * Don't bother responding if the destination was a broadcast.
1082                      */
1083                     if (th->th_dport == th->th_sport) {
1084                               if (isipv6) {
1085                                         if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1086                                                                    &ip6->ip6_src))
1087                                                   goto drop;
1088                               } else {
1089                                         if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1090                                                   goto drop;
1091                               }
1092                     }
1093                     /*
1094                      * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1095                      *
1096                      * Note that it is quite possible to receive unicast
1097                      * link-layer packets with a broadcast IP address. Use
1098                      * in_broadcast() to find them.
1099                      */
1100                     if (m->m_flags & (M_BCAST | M_MCAST))
1101                               goto drop;
1102                     if (isipv6) {
1103                               if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1104                                   IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1105                                         goto drop;
1106                     } else {
1107                               if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1108                                   IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1109                                   ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
1110                                   in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1111                                         goto drop;
1112                     }
1113                     /*
1114                      * SYN appears to be valid; create compressed TCP state
1115                      * for syncache.
1116                      */
1117                     if (so->so_qlen <= so->so_qlimit) {
1118                               tcp_dooptions(&to, optp, optlen, TRUE, th->th_ack);
1119                               if (!syncache_add(&inc, &to, th, so, m))
1120                                         goto drop;
1121 
1122                               /*
1123                                * Entry added to syncache, mbuf used to
1124                                * send SYN,ACK packet.
1125                                */
1126                               return(IPPROTO_DONE);
1127                     }
1128                     goto drop;
1129           }
1130 
1131 after_listen:
1132           /*
1133            * Should not happen - syncache should pick up these connections.
1134            *
1135            * Once we are past handling listen sockets we must be in the
1136            * correct protocol processing thread.
1137            */
1138           KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
1139           KKASSERT(so->so_port == &curthread->td_msgport);
1140 
1141           /* Unscale the window into a 32-bit value. */
1142           if (!(thflags & TH_SYN))
1143                     tiwin = th->th_win << tp->snd_scale;
1144           else
1145                     tiwin = th->th_win;
1146 
1147           /*
1148            * This is the second part of the MSS DoS prevention code (after
1149            * minmss on the sending side) and it deals with too many too small
1150            * tcp packets in a too short timeframe (1 second).
1151            *
1152            * XXX Removed.  This code was crap.  It does not scale to network
1153            *     speed, and default values break NFS.  Gone.
1154            */
1155           /* REMOVED */
1156 
1157           /*
1158            * Segment received on connection.
1159            *
1160            * Reset idle time and keep-alive timer.  Don't waste time if less
1161            * than a second has elapsed.
1162            */
1163           if ((int)(ticks - tp->t_rcvtime) > hz)
1164                     tcp_timer_keep_activity(tp, thflags);
1165 
1166           /*
1167            * Process options.
1168            * XXX this is traditional behavior, may need to be cleaned up.
1169            */
1170           tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0, th->th_ack);
1171           if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1172                     if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) {
1173                               tp->t_flags |= TF_RCVD_SCALE;
1174                               tp->snd_scale = to.to_requested_s_scale;
1175                     }
1176 
1177                     /*
1178                      * Initial send window; will be updated upon next ACK
1179                      */
1180                     tp->snd_wnd = th->th_win;
1181 
1182                     if (to.to_flags & TOF_TS) {
1183                               tp->t_flags |= TF_RCVD_TSTMP;
1184                               tp->ts_recent = to.to_tsval;
1185                               tp->ts_recent_age = ticks;
1186                     }
1187                     if (!(to.to_flags & TOF_MSS))
1188                               to.to_mss = 0;
1189                     tcp_rmx_init(tp, to.to_mss);
1190                     /*
1191                      * Only set the TF_SACK_PERMITTED per-connection flag
1192                      * if we got a SACK_PERMITTED option from the other side
1193                      * and the global tcp_do_sack variable is true.
1194                      */
1195                     if (tcp_do_sack && (to.to_flags & TOF_SACK_PERMITTED))
1196                               tp->t_flags |= TF_SACK_PERMITTED;
1197           }
1198 
1199           /*
1200            * Header prediction: check for the two common cases
1201            * of a uni-directional data xfer.  If the packet has
1202            * no control flags, is in-sequence, the window didn't
1203            * change and we're not retransmitting, it's a
1204            * candidate.  If the length is zero and the ack moved
1205            * forward, we're the sender side of the xfer.  Just
1206            * free the data acked & wake any higher level process
1207            * that was blocked waiting for space.  If the length
1208            * is non-zero and the ack didn't move, we're the
1209            * receiver side.  If we're getting packets in-order
1210            * (the reassembly queue is empty), add the data to
1211            * the socket buffer and note that we need a delayed ack.
1212            * Make sure that the hidden state-flags are also off.
1213            * Since we check for TCPS_ESTABLISHED above, it can only
1214            * be TH_NEEDSYN.
1215            */
1216           if (tp->t_state == TCPS_ESTABLISHED &&
1217               (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1218               !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
1219               (!(to.to_flags & TOF_TS) ||
1220                TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1221               th->th_seq == tp->rcv_nxt &&
1222               tp->snd_nxt == tp->snd_max) {
1223 
1224                     /*
1225                      * If last ACK falls within this segment's sequence numbers,
1226                      * record the timestamp.
1227                      * NOTE that the test is modified according to the latest
1228                      * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1229                      */
1230                     if ((to.to_flags & TOF_TS) &&
1231                         SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1232                               tp->ts_recent_age = ticks;
1233                               tp->ts_recent = to.to_tsval;
1234                     }
1235 
1236                     if (tlen == 0) {
1237                               if (SEQ_GT(th->th_ack, tp->snd_una) &&
1238                                   SEQ_LEQ(th->th_ack, tp->snd_max) &&
1239                                   tp->snd_cwnd >= tp->snd_wnd &&
1240                                   !IN_FASTRECOVERY(tp)) {
1241                                         /*
1242                                          * This is a pure ack for outstanding data.
1243                                          */
1244                                         ++tcpstat.tcps_predack;
1245                                         /*
1246                                          * "bad retransmit" recovery
1247                                          *
1248                                          * If Eifel detection applies, then
1249                                          * it is deterministic, so use it
1250                                          * unconditionally over the old heuristic.
1251                                          * Otherwise, fall back to the old heuristic.
1252                                          */
1253                                         if (tcp_do_eifel_detect &&
1254                                             (to.to_flags & TOF_TS) && to.to_tsecr &&
1255                                             (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
1256                                                   /* Eifel detection applicable. */
1257                                                   if (to.to_tsecr < tp->t_rexmtTS) {
1258                                                             tcp_revert_congestion_state(tp);
1259                                                             ++tcpstat.tcps_eifeldetected;
1260                                                             if (tp->t_rxtshift != 1 ||
1261                                                                 ticks >= tp->t_badrxtwin)
1262                                                                       ++tcpstat.tcps_rttcantdetect;
1263                                                   }
1264                                         } else if (tp->t_rxtshift == 1 &&
1265                                                      ticks < tp->t_badrxtwin) {
1266                                                   tcp_revert_congestion_state(tp);
1267                                                   ++tcpstat.tcps_rttdetected;
1268                                         }
1269                                         tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
1270                                             TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
1271                                         /*
1272                                          * Recalculate the retransmit timer / rtt.
1273                                          *
1274                                          * Some machines (certain windows boxes)
1275                                          * send broken timestamp replies during the
1276                                          * SYN+ACK phase, ignore timestamps of 0.
1277                                          */
1278                                         if ((to.to_flags & TOF_TS) && to.to_tsecr) {
1279                                                   tcp_xmit_timer(tp,
1280                                                                   ticks - to.to_tsecr + 1,
1281                                                                   th->th_ack);
1282                                         } else if (tp->t_rtttime &&
1283                                                      SEQ_GT(th->th_ack, tp->t_rtseq)) {
1284                                                   tcp_xmit_timer(tp,
1285                                                                   ticks - tp->t_rtttime + 1,
1286                                                                   th->th_ack);
1287                                         }
1288                                         tcp_xmit_bandwidth_limit(tp, th->th_ack);
1289                                         acked = th->th_ack - tp->snd_una;
1290                                         tcpstat.tcps_rcvackpack++;
1291                                         tcpstat.tcps_rcvackbyte += acked;
1292                                         sbdrop(&so->so_snd.sb, acked);
1293                                         tp->snd_recover = th->th_ack - 1;
1294                                         tp->snd_una = th->th_ack;
1295                                         tp->t_dupacks = 0;
1296                                         /*
1297                                          * Update window information.
1298                                          */
1299                                         if (tiwin != tp->snd_wnd &&
1300                                             acceptable_window_update(tp, th, tiwin)) {
1301                                                   /* keep track of pure window updates */
1302                                                   if (tp->snd_wl2 == th->th_ack &&
1303                                                       tiwin > tp->snd_wnd)
1304                                                             tcpstat.tcps_rcvwinupd++;
1305                                                   tp->snd_wnd = tiwin;
1306                                                   tp->snd_wl1 = th->th_seq;
1307                                                   tp->snd_wl2 = th->th_ack;
1308                                                   if (tp->snd_wnd > tp->max_sndwnd)
1309                                                             tp->max_sndwnd = tp->snd_wnd;
1310                                         }
1311                                         m_freem(m);
1312                                         ND6_HINT(tp); /* some progress has been done */
1313                                         /*
1314                                          * If all outstanding data are acked, stop
1315                                          * retransmit timer, otherwise restart timer
1316                                          * using current (possibly backed-off) value.
1317                                          * If process is waiting for space,
1318                                          * wakeup/selwakeup/signal.  If data
1319                                          * are ready to send, let tcp_output
1320                                          * decide between more output or persist.
1321                                          */
1322                                         if (tp->snd_una == tp->snd_max) {
1323                                                   tcp_callout_stop(tp, tp->tt_rexmt);
1324                                         } else if (!tcp_callout_active(tp,
1325                                                       tp->tt_persist)) {
1326                                                   tcp_callout_reset(tp, tp->tt_rexmt,
1327                                                       tp->t_rxtcur, tcp_timer_rexmt);
1328                                         }
1329                                         sowwakeup(so);
1330                                         if (so->so_snd.ssb_cc > 0 &&
1331                                             !tcp_output_pending(tp))
1332                                                   tcp_output_fair(tp);
1333                                         return(IPPROTO_DONE);
1334                               }
1335                     } else if (tiwin == tp->snd_wnd &&
1336                         th->th_ack == tp->snd_una &&
1337                         TAILQ_EMPTY(&tp->t_segq) &&
1338                         tlen <= ssb_space(&so->so_rcv)) {
1339                               u_long newsize = 0; /* automatic sockbuf scaling */
1340                               /*
1341                                * This is a pure, in-sequence data packet
1342                                * with nothing on the reassembly queue and
1343                                * we have enough buffer space to take it.
1344                                */
1345                               ++tcpstat.tcps_preddat;
1346                               tp->rcv_nxt += tlen;
1347                               tcpstat.tcps_rcvpack++;
1348                               tcpstat.tcps_rcvbyte += tlen;
1349                               ND6_HINT(tp);       /* some progress has been done */
1350                     /*
1351                      * Automatic sizing of receive socket buffer.  Often the receive
1352                      * buffer size is not optimally adjusted to the actual network
1353                      * conditions at hand (delay bandwidth product).  Setting the
1354                      * buffer size too small limits throughput on links with high
1355                      * bandwidth and high delay (eg. trans-continental/oceanic links).
1356                      *
1357                      * On the receive side the socket buffer memory is only rarely
1358                      * used to any significant extent.  This allows us to be much
1359                      * more aggressive in scaling the receive socket buffer.  For
1360                      * the case that the buffer space is actually used to a large
1361                      * extent and we run out of kernel memory we can simply drop
1362                      * the new segments; TCP on the sender will just retransmit it
1363                      * later.  Setting the buffer size too big may only consume too
1364                      * much kernel memory if the application doesn't read() from
1365                      * the socket or packet loss or reordering makes use of the
1366                      * reassembly queue.
1367                      *
1368                      * The criteria to step up the receive buffer one notch are:
1369                      *  1. the number of bytes received during the time it takes
1370                      *     one timestamp to be reflected back to us (the RTT);
1371                      *  2. received bytes per RTT is within seven eighths of the
1372                      *     current socket buffer size;
1373                      *  3. receive buffer size has not hit maximal automatic size;
1374                      *
1375                      * This algorithm does one step per RTT at most and only if
1376                      * we receive a bulk stream w/o packet losses or reorderings.
1377                      * Shrinking the buffer during idle times is not necessary as
1378                      * it doesn't consume any memory when idle.
1379                      *
1380                      * TODO: Only step up if the application is actually serving
1381                      * the buffer to better manage the socket buffer resources.
1382                      */
1383                               if (tcp_do_autorcvbuf &&
1384                                   to.to_tsecr &&
1385                                   (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
1386                                         if (to.to_tsecr > tp->rfbuf_ts &&
1387                                             to.to_tsecr - tp->rfbuf_ts < hz) {
1388                                                   if (tp->rfbuf_cnt >
1389                                                       (so->so_rcv.ssb_hiwat / 8 * 7) &&
1390                                                       so->so_rcv.ssb_hiwat <
1391                                                       tcp_autorcvbuf_max) {
1392                                                             newsize =
1393                                                                 ulmin(so->so_rcv.ssb_hiwat +
1394                                                                         tcp_autorcvbuf_inc,
1395                                                                         tcp_autorcvbuf_max);
1396                                                   }
1397                                                   /* Start over with next RTT. */
1398                                                   tp->rfbuf_ts = 0;
1399                                                   tp->rfbuf_cnt = 0;
1400                                         } else
1401                                                   tp->rfbuf_cnt += tlen;        /* add up */
1402                               }
1403                               /*
1404                                * Add data to socket buffer.
1405                                */
1406                               if (so->so_state & SS_CANTRCVMORE) {
1407                                         m_freem(m);
1408                               } else {
1409                                         /*
1410                                          * Set new socket buffer size, give up when
1411                                          * limit is reached.
1412                                          *
1413                                          * Adjusting the size can mess up ACK
1414                                          * sequencing when pure window updates are
1415                                          * being avoided (which is the default),
1416                                          * so force an ack.
1417                                          */
1418                                         lwkt_gettoken(&so->so_rcv.ssb_token);
1419                                         if (newsize) {
1420                                                   tp->t_flags |= TF_RXRESIZED;
1421                                                   if (!ssb_reserve(&so->so_rcv, newsize,
1422                                                                        so, NULL)) {
1423                                                             atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
1424                                                   }
1425                                                   if (newsize >=
1426                                                       (TCP_MAXWIN << tp->rcv_scale)) {
1427                                                             atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
1428                                                   }
1429                                         }
1430                                         m_adj(m, drop_hdrlen); /* delayed header drop */
1431                                         ssb_appendstream(&so->so_rcv, m);
1432                                         lwkt_reltoken(&so->so_rcv.ssb_token);
1433                               }
1434                               sorwakeup(so);
1435                               /*
1436                                * This code is responsible for most of the ACKs
1437                                * the TCP stack sends back after receiving a data
1438                                * packet.  Note that the DELAY_ACK check fails if
1439                                * the delack timer is already running, which results
1440                                * in an ack being sent every other packet (which is
1441                                * what we want).
1442                                *
1443                                * We then further aggregate acks by not actually
1444                                * sending one until the protocol thread has completed
1445                                * processing the current backlog of packets.  This
1446                                * does not delay the ack any further, but allows us
1447                                * to take advantage of the packet aggregation that
1448                                * high speed NICs do (usually blocks of 8-10 packets)
1449                                * to send a single ack rather than four or five acks,
1450                                * greatly reducing the ack rate, the return channel
1451                                * bandwidth, and the protocol overhead on both ends.
1452                                *
1453                                * Since this also has the effect of slowing down
1454                                * the exponential slow-start ramp-up, systems with
1455                                * very large bandwidth-delay products might want
1456                                * to turn the feature off.
1457                                */
1458                               if (DELAY_ACK(tp)) {
1459                                         tcp_callout_reset(tp, tp->tt_delack,
1460                                             tcp_delacktime, tcp_timer_delack);
1461                               } else if (tcp_aggregate_acks) {
1462                                         tp->t_flags |= TF_ACKNOW;
1463                                         if (!(tp->t_flags & TF_ONOUTPUTQ)) {
1464                                                   tp->t_flags |= TF_ONOUTPUTQ;
1465                                                   tp->tt_cpu = mycpu->gd_cpuid;
1466                                                   TAILQ_INSERT_TAIL(
1467                                                       &tcpcbackq[tp->tt_cpu].head,
1468                                                       tp, t_outputq);
1469                                         }
1470                               } else {
1471                                         tp->t_flags |= TF_ACKNOW;
1472                                         tcp_output(tp);
1473                               }
1474                               return(IPPROTO_DONE);
1475                     }
1476           }
1477 
1478           /*
1479            * Calculate amount of space in receive window,
1480            * and then do TCP input processing.
1481            * Receive window is amount of space in rcv queue,
1482            * but not less than advertised window.
1483            */
1484           recvwin = ssb_space(&so->so_rcv);
1485           if (recvwin < 0)
1486                     recvwin = 0;
1487           tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
1488 
1489           /* Reset receive buffer auto scaling when not in bulk receive mode. */
1490           tp->rfbuf_ts = 0;
1491           tp->rfbuf_cnt = 0;
1492 
1493           switch (tp->t_state) {
1494           /*
1495            * If the state is SYN_RECEIVED:
1496            *        if seg contains an ACK, but not for our SYN/ACK, send a RST.
1497            */
1498           case TCPS_SYN_RECEIVED:
1499                     if ((thflags & TH_ACK) &&
1500                         (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1501                          SEQ_GT(th->th_ack, tp->snd_max))) {
1502                               rstreason = BANDLIM_RST_OPENPORT;
1503                               goto dropwithreset;
1504                     }
1505                     break;
1506 
1507           /*
1508            * If the state is SYN_SENT:
1509            *        if seg contains an ACK, but not for our SYN, drop the input.
1510            *        if seg contains a RST, then drop the connection.
1511            *        if seg does not contain SYN, then drop it.
1512            * Otherwise this is an acceptable SYN segment
1513            *        initialize tp->rcv_nxt and tp->irs
1514            *        if seg contains ack then advance tp->snd_una
1515            *        if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1516            *        arrange for segment to be acked (eventually)
1517            *        continue processing rest of data/controls, beginning with URG
1518            */
1519           case TCPS_SYN_SENT:
1520                     if ((thflags & TH_ACK) &&
1521                         (SEQ_LEQ(th->th_ack, tp->iss) ||
1522                          SEQ_GT(th->th_ack, tp->snd_max))) {
1523                               rstreason = BANDLIM_UNLIMITED;
1524                               goto dropwithreset;
1525                     }
1526                     if (thflags & TH_RST) {
1527                               if (thflags & TH_ACK)
1528                                         tp = tcp_drop(tp, ECONNREFUSED);
1529                               goto drop;
1530                     }
1531                     if (!(thflags & TH_SYN))
1532                               goto drop;
1533 
1534                     tp->irs = th->th_seq;
1535                     tcp_rcvseqinit(tp);
1536                     if (thflags & TH_ACK) {
1537                               /* Our SYN was acked. */
1538                               tcpstat.tcps_connects++;
1539                               soisconnected(so);
1540                               /* Do window scaling on this connection? */
1541                               if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
1542                                   (TF_RCVD_SCALE | TF_REQ_SCALE))
1543                                         tp->rcv_scale = tp->request_r_scale;
1544                               tp->rcv_adv += tp->rcv_wnd;
1545                               tp->snd_una++;                /* SYN is acked */
1546                               tcp_callout_stop(tp, tp->tt_rexmt);
1547                               /*
1548                                * If there's data, delay ACK; if there's also a FIN
1549                                * ACKNOW will be turned on later.
1550                                */
1551                               if (DELAY_ACK(tp) && tlen != 0) {
1552                                         tcp_callout_reset(tp, tp->tt_delack,
1553                                             tcp_delacktime, tcp_timer_delack);
1554                               } else {
1555                                         tp->t_flags |= TF_ACKNOW;
1556                               }
1557                               /*
1558                                * Received <SYN,ACK> in SYN_SENT[*] state.
1559                                * Transitions:
1560                                *        SYN_SENT  --> ESTABLISHED
1561                                *        SYN_SENT* --> FIN_WAIT_1
1562                                */
1563                               tp->t_starttime = ticks;
1564                               if (tp->t_flags & TF_NEEDFIN) {
1565                                         TCP_STATE_CHANGE(tp, TCPS_FIN_WAIT_1);
1566                                         tp->t_flags &= ~TF_NEEDFIN;
1567                                         thflags &= ~TH_SYN;
1568                               } else {
1569                                         tcp_established(tp);
1570                               }
1571                     } else {
1572                               /*
1573                                * Received initial SYN in SYN-SENT[*] state =>
1574                                * simultaneous open.
1575                                * Do 3-way handshake:
1576                                *          SYN-SENT -> SYN-RECEIVED
1577                                *          SYN-SENT* -> SYN-RECEIVED*
1578                                */
1579                               tp->t_flags |= TF_ACKNOW;
1580                               tcp_callout_stop(tp, tp->tt_rexmt);
1581                               TCP_STATE_CHANGE(tp, TCPS_SYN_RECEIVED);
1582                     }
1583 
1584                     /*
1585                      * Advance th->th_seq to correspond to first data byte.
1586                      * If data, trim to stay within window,
1587                      * dropping FIN if necessary.
1588                      */
1589                     th->th_seq++;
1590                     if (tlen > tp->rcv_wnd) {
1591                               todrop = tlen - tp->rcv_wnd;
1592                               m_adj(m, -todrop);
1593                               tlen = tp->rcv_wnd;
1594                               thflags &= ~TH_FIN;
1595                               tcpstat.tcps_rcvpackafterwin++;
1596                               tcpstat.tcps_rcvbyteafterwin += todrop;
1597                     }
1598                     tp->snd_wl1 = th->th_seq - 1;
1599                     tp->rcv_up = th->th_seq;
1600                     /*
1601                      * Client side of transaction: already sent SYN and data.
1602                      * If the remote host used T/TCP to validate the SYN,
1603                      * our data will be ACK'd; if so, enter normal data segment
1604                      * processing in the middle of step 5, ack processing.
1605                      * Otherwise, goto step 6.
1606                      */
1607                     if (thflags & TH_ACK)
1608                               goto process_ACK;
1609 
1610                     goto step6;
1611 
1612           /*
1613            * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1614            *        do normal processing (we no longer bother with T/TCP).
1615            */
1616           case TCPS_LAST_ACK:
1617           case TCPS_CLOSING:
1618           case TCPS_TIME_WAIT:
1619                     /*
1620                      * Continue normal processing
1621                      */
1622                     break;
1623           }
1624 
1625           /*
1626            * States other than LISTEN or SYN_SENT.
1627            * First check the RST flag and sequence number since reset segments
1628            * are exempt from the timestamp and connection count tests.  This
1629            * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1630            * below which allowed reset segments in half the sequence space
1631            * to fall through and be processed (which gives forged reset
1632            * segments with a random sequence number a 50 percent chance of
1633            * killing a connection).
1634            * Then check timestamp, if present.
1635            * Then check the connection count, if present.
1636            * Then check that at least some bytes of segment are within
1637            * receive window.  If segment begins before rcv_nxt,
1638            * drop leading data (and SYN); if nothing left, just ack.
1639            *
1640            *
1641            * If the RST bit is set, check the sequence number to see
1642            * if this is a valid reset segment.
1643            * RFC 793 page 37:
1644            *   In all states except SYN-SENT, all reset (RST) segments
1645            *   are validated by checking their SEQ-fields.  A reset is
1646            *   valid if its sequence number is in the window.
1647            * Note: this does not take into account delayed ACKs, so
1648            *   we should test against last_ack_sent instead of rcv_nxt.
1649            *   The sequence number in the reset segment is normally an
1650            *   echo of our outgoing acknowledgement numbers, but some hosts
1651            *   send a reset with the sequence number at the rightmost edge
1652            *   of our receive window, and we have to handle this case.
1653            * If we have multiple segments in flight, the initial reset
1654            * segment sequence numbers will be to the left of last_ack_sent,
1655            * but they will eventually catch up.
1656            * In any case, it never made sense to trim reset segments to
1657            * fit the receive window since RFC 1122 says:
1658            *   4.2.2.12  RST Segment: RFC-793 Section 3.4
1659            *
1660            *    A TCP SHOULD allow a received RST segment to include data.
1661            *
1662            *    DISCUSSION
1663            *           It has been suggested that a RST segment could contain
1664            *           ASCII text that encoded and explained the cause of the
1665            *           RST.  No standard has yet been established for such
1666            *           data.
1667            *
1668            * If the reset segment passes the sequence number test examine
1669            * the state:
1670            *    SYN_RECEIVED STATE:
1671            *        If passive open, return to LISTEN state.
1672            *        If active open, inform user that connection was refused.
1673            *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
1674            *        Inform user that connection was reset, and close tcb.
1675            *    CLOSING, LAST_ACK STATES:
1676            *        Close the tcb.
1677            *    TIME_WAIT STATE:
1678            *        Drop the segment - see Stevens, vol. 2, p. 964 and
1679            *        RFC 1337.
1680            */
1681           if (thflags & TH_RST) {
1682                     if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1683                         SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1684                               switch (tp->t_state) {
1685 
1686                               case TCPS_SYN_RECEIVED:
1687                                         so->so_error = ECONNREFUSED;
1688                                         goto close;
1689 
1690                               case TCPS_ESTABLISHED:
1691                               case TCPS_FIN_WAIT_1:
1692                               case TCPS_FIN_WAIT_2:
1693                               case TCPS_CLOSE_WAIT:
1694                                         so->so_error = ECONNRESET;
1695                               close:
1696                                         TCP_STATE_CHANGE(tp, TCPS_CLOSED);
1697                                         tcpstat.tcps_drops++;
1698                                         tp = tcp_close(tp);
1699                                         break;
1700 
1701                               case TCPS_CLOSING:
1702                               case TCPS_LAST_ACK:
1703                                         tp = tcp_close(tp);
1704                                         break;
1705 
1706                               case TCPS_TIME_WAIT:
1707                                         break;
1708                               }
1709                     }
1710                     goto drop;
1711           }
1712 
1713           /*
1714            * Allow a new connection to replace an existing connection that is
1715            * in the TIME_WAIT state if the new connection's SYN seq is different
1716            * from tp->irs.
1717            */
1718           if ((thflags & TH_SYN) &&
1719               tp->t_state == TCPS_TIME_WAIT &&
1720               th->th_seq != tp->irs) {
1721                     tp = tcp_close(tp);
1722                     goto findpcb;
1723           }
1724 
1725           /*
1726            * RFC 1323 PAWS: If we have a timestamp reply on this segment
1727            * and it's less than ts_recent, drop it.
1728            */
1729           if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 &&
1730               TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1731                     /* Check to see if ts_recent is over 24 days old.  */
1732                     if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1733                               /*
1734                                * Invalidate ts_recent.  If this segment updates
1735                                * ts_recent, the age will be reset later and ts_recent
1736                                * will get a valid value.  If it does not, setting
1737                                * ts_recent to zero will at least satisfy the
1738                                * requirement that zero be placed in the timestamp
1739                                * echo reply when ts_recent isn't valid.  The
1740                                * age isn't reset until we get a valid ts_recent
1741                                * because we don't want out-of-order segments to be
1742                                * dropped when ts_recent is old.
1743                                */
1744                               tp->ts_recent = 0;
1745                     } else if (tcp_paws_tolerance && tlen != 0 &&
1746                         tp->t_state == TCPS_ESTABLISHED &&
1747                         (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK&&
1748                         !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
1749                         th->th_ack == tp->snd_una &&
1750                         tiwin == tp->snd_wnd &&
1751                         TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent)&&
1752                         (th->th_seq == tp->rcv_nxt ||
1753                          (SEQ_GT(th->th_seq, tp->rcv_nxt) &&
1754                           tcp_paws_canreasslast(tp, th, tlen)))) {
1755                               /*
1756                                * This tends to prevent valid new segments from being
1757                                * dropped by the reordered segments sent by the fast
1758                                * retransmission algorithm on the sending side, i.e.
1759                                * the fast retransmitted segment w/ larger timestamp
1760                                * arrives earlier than the previously sent new segments
1761                                * w/ smaller timestamp.
1762                                *
1763                                * If following conditions are met, the segment is
1764                                * accepted:
1765                                * - The segment contains data
1766                                * - The connection is established
1767                                * - The header does not contain important flags
1768                                * - SYN or FIN is not needed
1769                                * - It does not acknowledge new data
1770                                * - Receive window is not changed
1771                                * - The timestamp is within "acceptable" range
1772                                * - The new segment is what we are expecting or
1773                                *   the new segment could be merged w/ the last
1774                                *   pending segment on the reassemble queue
1775                                */
1776                               tcpstat.tcps_pawsaccept++;
1777                               tcpstat.tcps_pawsdrop++;
1778                     } else {
1779                               tcpstat.tcps_rcvduppack++;
1780                               tcpstat.tcps_rcvdupbyte += tlen;
1781                               tcpstat.tcps_pawsdrop++;
1782                               if (tlen)
1783                                         goto dropafterack;
1784                               goto drop;
1785                     }
1786           }
1787 
1788           /*
1789            * In the SYN-RECEIVED state, validate that the packet belongs to
1790            * this connection before trimming the data to fit the receive
1791            * window.  Check the sequence number versus IRS since we know
1792            * the sequence numbers haven't wrapped.  This is a partial fix
1793            * for the "LAND" DoS attack.
1794            */
1795           if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1796                     rstreason = BANDLIM_RST_OPENPORT;
1797                     goto dropwithreset;
1798           }
1799 
1800           todrop = tp->rcv_nxt - th->th_seq;
1801           if (todrop > 0) {
1802                     if (TCP_DO_SACK(tp)) {
1803                               /* Report duplicate segment at head of packet. */
1804                               tp->reportblk.rblk_start = th->th_seq;
1805                               tp->reportblk.rblk_end = TCP_SACK_BLKEND(
1806                                   th->th_seq + tlen, thflags);
1807                               if (SEQ_GT(tp->reportblk.rblk_end, tp->rcv_nxt))
1808                                         tp->reportblk.rblk_end = tp->rcv_nxt;
1809                               tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_SACKLEFT);
1810                               tp->t_flags |= TF_ACKNOW;
1811                     }
1812                     if (thflags & TH_SYN) {
1813                               thflags &= ~TH_SYN;
1814                               th->th_seq++;
1815                               if (th->th_urp > 1)
1816                                         th->th_urp--;
1817                               else
1818                                         thflags &= ~TH_URG;
1819                               todrop--;
1820                     }
1821                     /*
1822                      * Following if statement from Stevens, vol. 2, p. 960.
1823                      */
1824                     if (todrop > tlen ||
1825                         (todrop == tlen && !(thflags & TH_FIN))) {
1826                               /*
1827                                * Any valid FIN must be to the left of the window.
1828                                * At this point the FIN must be a duplicate or out
1829                                * of sequence; drop it.
1830                                */
1831                               thflags &= ~TH_FIN;
1832 
1833                               /*
1834                                * Send an ACK to resynchronize and drop any data.
1835                                * But keep on processing for RST or ACK.
1836                                */
1837                               tp->t_flags |= TF_ACKNOW;
1838                               todrop = tlen;
1839                               tcpstat.tcps_rcvduppack++;
1840                               tcpstat.tcps_rcvdupbyte += todrop;
1841                     } else {
1842                               tcpstat.tcps_rcvpartduppack++;
1843                               tcpstat.tcps_rcvpartdupbyte += todrop;
1844                     }
1845                     drop_hdrlen += todrop;        /* drop from the top afterwards */
1846                     th->th_seq += todrop;
1847                     tlen -= todrop;
1848                     if (th->th_urp > todrop)
1849                               th->th_urp -= todrop;
1850                     else {
1851                               thflags &= ~TH_URG;
1852                               th->th_urp = 0;
1853                     }
1854           }
1855 
1856           /*
1857            * If new data are received on a connection after the
1858            * user processes are gone, then RST the other end.
1859            */
1860           if ((so->so_state & SS_NOFDREF) &&
1861               tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1862                     tp = tcp_close(tp);
1863                     tcpstat.tcps_rcvafterclose++;
1864                     rstreason = BANDLIM_UNLIMITED;
1865                     goto dropwithreset;
1866           }
1867 
1868           /*
1869            * If segment ends after window, drop trailing data
1870            * (and PUSH and FIN); if nothing left, just ACK.
1871            */
1872           todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1873           if (todrop > 0) {
1874                     tcpstat.tcps_rcvpackafterwin++;
1875                     if (todrop >= tlen) {
1876                               tcpstat.tcps_rcvbyteafterwin += tlen;
1877 
1878                               /*
1879                                * If window is closed can only take segments at
1880                                * window edge, and have to drop data and PUSH from
1881                                * incoming segments.  Continue processing, but
1882                                * remember to ack.  Otherwise, drop segment
1883                                * and ack.
1884                                */
1885                               if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1886                                         tp->t_flags |= TF_ACKNOW;
1887                                         tcpstat.tcps_rcvwinprobe++;
1888                               } else
1889                                         goto dropafterack;
1890                     } else
1891                               tcpstat.tcps_rcvbyteafterwin += todrop;
1892                     m_adj(m, -todrop);
1893                     tlen -= todrop;
1894                     thflags &= ~(TH_PUSH | TH_FIN);
1895           }
1896 
1897           /*
1898            * If last ACK falls within this segment's sequence numbers,
1899            * record its timestamp.
1900            * NOTE:
1901            * 1) That the test incorporates suggestions from the latest
1902            *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
1903            * 2) That updating only on newer timestamps interferes with
1904            *    our earlier PAWS tests, so this check should be solely
1905            *    predicated on the sequence space of this segment.
1906            * 3) That we modify the segment boundary check to be
1907            *        Last.ACK.Sent <= SEG.SEQ + SEG.LEN
1908            *    instead of RFC1323's
1909            *        Last.ACK.Sent < SEG.SEQ + SEG.LEN,
1910            *    This modified check allows us to overcome RFC1323's
1911            *    limitations as described in Stevens TCP/IP Illustrated
1912            *    Vol. 2 p.869. In such cases, we can still calculate the
1913            *    RTT correctly when RCV.NXT == Last.ACK.Sent.
1914            */
1915           if ((to.to_flags & TOF_TS) && SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1916               SEQ_LEQ(tp->last_ack_sent, (th->th_seq + tlen
1917                                                   + ((thflags & TH_SYN) != 0)
1918                                                   + ((thflags & TH_FIN) != 0)))) {
1919                     tp->ts_recent_age = ticks;
1920                     tp->ts_recent = to.to_tsval;
1921           }
1922 
1923           /*
1924            * If a SYN is in the window, then this is an
1925            * error and we send an RST and drop the connection.
1926            */
1927           if (thflags & TH_SYN) {
1928                     tp = tcp_drop(tp, ECONNRESET);
1929                     rstreason = BANDLIM_UNLIMITED;
1930                     goto dropwithreset;
1931           }
1932 
1933           /*
1934            * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
1935            * flag is on (half-synchronized state), then queue data for
1936            * later processing; else drop segment and return.
1937            */
1938           if (!(thflags & TH_ACK)) {
1939                     if (tp->t_state == TCPS_SYN_RECEIVED ||
1940                         (tp->t_flags & TF_NEEDSYN))
1941                               goto step6;
1942                     else
1943                               goto drop;
1944           }
1945 
1946           /*
1947            * Ack processing.
1948            */
1949           switch (tp->t_state) {
1950           /*
1951            * In SYN_RECEIVED state, the ACK acknowledges our SYN, so enter
1952            * ESTABLISHED state and continue processing.
1953            * The ACK was checked above.
1954            */
1955           case TCPS_SYN_RECEIVED:
1956 
1957                     tcpstat.tcps_connects++;
1958                     soisconnected(so);
1959                     /* Do window scaling? */
1960                     if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
1961                         (TF_RCVD_SCALE | TF_REQ_SCALE))
1962                               tp->rcv_scale = tp->request_r_scale;
1963                     /*
1964                      * Make transitions:
1965                      *      SYN-RECEIVED  -> ESTABLISHED
1966                      *      SYN-RECEIVED* -> FIN-WAIT-1
1967                      */
1968                     tp->t_starttime = ticks;
1969                     if (tp->t_flags & TF_NEEDFIN) {
1970                               TCP_STATE_CHANGE(tp, TCPS_FIN_WAIT_1);
1971                               tp->t_flags &= ~TF_NEEDFIN;
1972                     } else {
1973                               tcp_established(tp);
1974                     }
1975                     /*
1976                      * If segment contains data or FIN, will call tcp_reass()
1977                      * later; if not, do so now to pass queued data to user.
1978                      */
1979                     if (tlen == 0 && !(thflags & TH_FIN))
1980                               tcp_reass(tp, NULL, NULL, NULL);
1981                     /* fall into ... */
1982 
1983           /*
1984            * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1985            * ACKs.  If the ack is in the range
1986            *        tp->snd_una < th->th_ack <= tp->snd_max
1987            * then advance tp->snd_una to th->th_ack and drop
1988            * data from the retransmission queue.  If this ACK reflects
1989            * more up to date window information we update our window information.
1990            */
1991           case TCPS_ESTABLISHED:
1992           case TCPS_FIN_WAIT_1:
1993           case TCPS_FIN_WAIT_2:
1994           case TCPS_CLOSE_WAIT:
1995           case TCPS_CLOSING:
1996           case TCPS_LAST_ACK:
1997           case TCPS_TIME_WAIT:
1998 
1999                     if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2000                               boolean_t maynotdup = FALSE;
2001 
2002                               if (TCP_DO_SACK(tp))
2003                                         tcp_sack_update_scoreboard(tp, &to);
2004 
2005                               if (tlen != 0 || tiwin != tp->snd_wnd ||
2006                                   ((thflags & TH_FIN) && !(tp->t_flags & TF_SAWFIN)))
2007                                         maynotdup = TRUE;
2008 
2009                               if (!tcp_callout_active(tp, tp->tt_rexmt) ||
2010                                   th->th_ack != tp->snd_una) {
2011                                         if (!maynotdup)
2012                                                   tcpstat.tcps_rcvdupack++;
2013                                         tp->t_dupacks = 0;
2014                                         break;
2015                               }
2016 
/*
 * DELAY_DUPACK: mark duplicate-ACK processing for this segment as
 * deferred.  Sets delayed_dupack and saves the segment's ACK number
 * (th->th_ack) and the parsed TCP option flags (to.to_flags) into
 * th_dupack and to_flags.  The macro is #undef'ed right after its
 * last use below.
 * NOTE(review): the code that consumes these saved values is outside
 * this excerpt -- confirm where the deferred processing happens.
 */
2017 #define DELAY_DUPACK \
2018 do { \
2019           delayed_dupack = TRUE; \
2020           th_dupack = th->th_ack; \
2021           to_flags = to.to_flags; \
2022 } while (0)
2023                               if (maynotdup) {
2024                                         if (!tcp_do_rfc6675 ||
2025                                             !TCP_DO_SACK(tp) ||
2026                                             (to.to_flags &
2027                                              (TOF_SACK | TOF_SACK_REDUNDANT))
2028                                              != TOF_SACK) {
2029                                                   tp->t_dupacks = 0;
2030                                         } else {
2031                                                   DELAY_DUPACK;
2032                                         }
2033                                         break;
2034                               }
2035                               if ((thflags & TH_FIN) && !(tp->t_flags & TF_QUEDFIN)) {
2036                                         /*
2037                                          * This could happen if the reassembly
2038                                          * queue overflowed or was drained.  Don't
2039                                          * drop this FIN here; defer the duplicated
2040                                          * ACK processing until this FIN gets queued.
2041                                          */
2042                                         DELAY_DUPACK;
2043                                         break;
2044                               }
2045 #undef DELAY_DUPACK
2046 
2047                               if (tcp_recv_dupack(tp, th->th_ack, to.to_flags))
2048                                         goto drop;
2049                               else
2050                                         break;
2051                     }
2052 
2053                     KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una"));
2054                     tp->t_dupacks = 0;
2055                     if (SEQ_GT(th->th_ack, tp->snd_max)) {
2056                               /*
2057                                * Detected optimistic ACK attack.
2058                                * Force slow-start to de-synchronize attack.
2059                                */
2060                               tp->snd_cwnd = tp->t_maxseg;
2061                               tp->snd_wacked = 0;
2062 
2063                               tcpstat.tcps_rcvacktoomuch++;
2064                               goto dropafterack;
2065                     }
2066                     /*
2067                      * If we reach this point, ACK is not a duplicate,
2068                      *     i.e., it ACKs something we sent.
2069                      */
2070                     if (tp->t_flags & TF_NEEDSYN) {
2071                               /*
2072                                * T/TCP: Connection was half-synchronized, and our
2073                                * SYN has been ACK'd (so connection is now fully
2074                                * synchronized).  Go to non-starred state,
2075                                * increment snd_una for ACK of SYN, and check if
2076                                * we can do window scaling.
2077                                */
2078                               tp->t_flags &= ~TF_NEEDSYN;
2079                               tp->snd_una++;
2080                               /* Do window scaling? */
2081                               if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
2082                                   (TF_RCVD_SCALE | TF_REQ_SCALE))
2083                                         tp->rcv_scale = tp->request_r_scale;
2084                     }
2085 
2086 process_ACK:
2087                     acked = th->th_ack - tp->snd_una;
2088                     tcpstat.tcps_rcvackpack++;
2089                     tcpstat.tcps_rcvackbyte += acked;
2090 
2091                     if (tcp_do_eifel_detect && acked > 0 &&
2092                         (to.to_flags & TOF_TS) && (to.to_tsecr != 0) &&
2093                         (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
2094                               /* Eifel detection applicable. */
2095                               if (to.to_tsecr < tp->t_rexmtTS) {
2096                                         ++tcpstat.tcps_eifeldetected;
2097                                         tcp_revert_congestion_state(tp);
2098                                         if (tp->t_rxtshift != 1 ||
2099                                             ticks >= tp->t_badrxtwin)
2100                                                   ++tcpstat.tcps_rttcantdetect;
2101                               }
2102                     } else if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
2103                               /*
2104                                * If we just performed our first retransmit,
2105                                * and the ACK arrives within our recovery window,
2106                                * then it was a mistake to do the retransmit
2107                                * in the first place.  Recover our original cwnd
2108                                * and ssthresh, and proceed to transmit where we
2109                                * left off.
2110                                */
2111                               tcp_revert_congestion_state(tp);
2112                               ++tcpstat.tcps_rttdetected;
2113                     }
2114 
2115                     /*
2116                      * If we have a timestamp reply, update smoothed
2117                      * round trip time.  If no timestamp is present but
2118                      * transmit timer is running and timed sequence
2119                      * number was acked, update smoothed round trip time.
2120                      * Since we now have an rtt measurement, cancel the
2121                      * timer backoff (cf., Phil Karn's retransmit alg.).
2122                      * Recompute the initial retransmit timer.
2123                      *
2124                      * Some machines (certain windows boxes) send broken
2125                      * timestamp replies during the SYN+ACK phase, ignore
2126                      * timestamps of 0.
2127                      */
2128                     if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0))
2129                               tcp_xmit_timer(tp, ticks - to.to_tsecr + 1,
2130                                                th->th_ack);
2131                     else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2132                               tcp_xmit_timer(tp, ticks - tp->t_rtttime + 1,
2133                                                th->th_ack);
2134                     tcp_xmit_bandwidth_limit(tp, th->th_ack);
2135 
2136                     /*
2137                      * If no data (only SYN) was ACK'd,
2138                      *    skip rest of ACK processing.
2139                      */
2140                     if (acked == 0)
2141                               goto step6;
2142 
2143                     /* Stop looking for an acceptable ACK since one was received. */
2144                     tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
2145                         TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
2146 
2147                     if (acked > so->so_snd.ssb_cc) {
2148                               tp->snd_wnd -= so->so_snd.ssb_cc;
2149                               sbdrop(&so->so_snd.sb, (int)so->so_snd.ssb_cc);
2150                               ourfinisacked = TRUE;
2151                     } else {
2152                               sbdrop(&so->so_snd.sb, acked);
2153                               tp->snd_wnd -= acked;
2154                               ourfinisacked = FALSE;
2155                     }
2156                     sowwakeup(so);
2157 
2158                     /*
2159                      * Update window information.
2160                      */
2161                     if (acceptable_window_update(tp, th, tiwin)) {
2162                               /* keep track of pure window updates */
2163                               if (tlen == 0 && tp->snd_wl2 == th->th_ack &&
2164                                   tiwin > tp->snd_wnd)
2165                                         tcpstat.tcps_rcvwinupd++;
2166                               tp->snd_wnd = tiwin;
2167                               tp->snd_wl1 = th->th_seq;
2168                               tp->snd_wl2 = th->th_ack;
2169                               if (tp->snd_wnd > tp->max_sndwnd)
2170                                         tp->max_sndwnd = tp->snd_wnd;
2171                               needoutput = TRUE;
2172                     }
2173 
2174                     tp->snd_una = th->th_ack;
2175                     if (TCP_DO_SACK(tp))
2176                               tcp_sack_update_scoreboard(tp, &to);
2177                     if (IN_FASTRECOVERY(tp)) {
2178                               if (SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2179                                         EXIT_FASTRECOVERY(tp);
2180                                         needoutput = TRUE;
2181                                         /*
2182                                          * If the congestion window was inflated
2183                                          * to account for the other side's
2184                                          * cached packets, retract it.
2185                                          */
2186                                         if (!TCP_DO_SACK(tp))
2187                                                   tp->snd_cwnd = tp->snd_ssthresh;
2188 
2189                                         /*
2190                                          * Window inflation should have left us
2191                                          * with approximately snd_ssthresh outstanding
2192                                          * data.  But, in case we would be inclined
2193                                          * to send a burst, better do it using
2194                                          * slow start.
2195                                          */
2196                                         if (SEQ_GT(th->th_ack + tp->snd_cwnd,
2197                                                      tp->snd_max + 2 * tp->t_maxseg))
2198                                                   tp->snd_cwnd =
2199                                                       (tp->snd_max - tp->snd_una) +
2200                                                       2 * tp->t_maxseg;
2201 
2202                                         tp->snd_wacked = 0;
2203                               } else {
2204                                         if (TCP_DO_SACK(tp)) {
2205                                                   tp->snd_max_rexmt = tp->snd_max;
2206                                                   tcp_sack_rexmt(tp,
2207                                                       tp->snd_una == tp->rexmt_high);
2208                                         } else {
2209                                                   tcp_newreno_partial_ack(tp, th, acked);
2210                                         }
2211                                         needoutput = FALSE;
2212                               }
2213                     } else {
2214                               /*
2215                                * Open the congestion window.  When in slow-start,
2216                                * open exponentially: maxseg per packet.  Otherwise,
2217                                * open linearly: maxseg per window.
2218                                */
2219                               if (tp->snd_cwnd <= tp->snd_ssthresh) {
2220                                         u_int abc_sslimit =
2221                                             (SEQ_LT(tp->snd_nxt, tp->snd_max) ?
2222                                              tp->t_maxseg : 2 * tp->t_maxseg);
2223 
2224                                         /* slow-start */
2225                                         tp->snd_cwnd += tcp_do_abc ?
2226                                             min(acked, abc_sslimit) : tp->t_maxseg;
2227                               } else {
2228                                         /* linear increase */
2229                                         tp->snd_wacked += tcp_do_abc ? acked :
2230                                             tp->t_maxseg;
2231                                         if (tp->snd_wacked >= tp->snd_cwnd) {
2232                                                   tp->snd_wacked -= tp->snd_cwnd;
2233                                                   tp->snd_cwnd += tp->t_maxseg;
2234                                         }
2235                               }
2236                               tp->snd_cwnd = min(tp->snd_cwnd,
2237                                                      TCP_MAXWIN << tp->snd_scale);
2238                               tp->snd_recover = th->th_ack - 1;
2239                     }
2240                     if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2241                               tp->snd_nxt = tp->snd_una;
2242 
2243                     /*
2244                      * If all outstanding data is acked, stop retransmit
2245                      * timer and remember to restart (more output or persist).
2246                      * If there is more data to be acked, restart retransmit
2247                      * timer, using current (possibly backed-off) value.
2248                      */
2249                     if (th->th_ack == tp->snd_max) {
2250                               tcp_callout_stop(tp, tp->tt_rexmt);
2251                               needoutput = TRUE;
2252                     } else if (!tcp_callout_active(tp, tp->tt_persist)) {
2253                               tcp_callout_reset(tp, tp->tt_rexmt, tp->t_rxtcur,
2254                                   tcp_timer_rexmt);
2255                     }
2256 
2257                     switch (tp->t_state) {
2258                     /*
2259                      * In FIN_WAIT_1 STATE in addition to the processing
2260                      * for the ESTABLISHED state if our FIN is now acknowledged
2261                      * then enter FIN_WAIT_2.
2262                      */
2263                     case TCPS_FIN_WAIT_1:
2264                               if (ourfinisacked) {
2265                                         /*
2266                                          * If we can't receive any more
2267                                          * data, then closing user can proceed.
2268                                          * Starting the timer is contrary to the
2269                                          * specification, but if we don't get a FIN
2270                                          * we'll hang forever.
2271                                          */
2272                                         if (so->so_state & SS_CANTRCVMORE) {
2273                                                   soisdisconnected(so);
2274                                                   tcp_callout_reset(tp, tp->tt_2msl,
2275                                                       tp->t_maxidle, tcp_timer_2msl);
2276                                         }
2277                                         TCP_STATE_CHANGE(tp, TCPS_FIN_WAIT_2);
2278                               }
2279                               break;
2280 
2281                     /*
2282                      * In CLOSING STATE in addition to the processing for
2283                      * the ESTABLISHED state if the ACK acknowledges our FIN
2284                      * then enter the TIME-WAIT state, otherwise ignore
2285                      * the segment.
2286                      */
2287                     case TCPS_CLOSING:
2288                               if (ourfinisacked) {
2289                                         TCP_STATE_CHANGE(tp, TCPS_TIME_WAIT);
2290                                         tcp_canceltimers(tp);
2291                                         tcp_callout_reset(tp, tp->tt_2msl,
2292                                                       2 * tcp_rmx_msl(tp),
2293                                                       tcp_timer_2msl);
2294                                         soisdisconnected(so);
2295                               }
2296                               break;
2297 
2298                     /*
2299                      * In LAST_ACK, we may still be waiting for data to drain
2300                      * and/or to be acked, as well as for the ack of our FIN.
2301                      * If our FIN is now acknowledged, delete the TCB,
2302                      * enter the closed state and return.
2303                      */
2304                     case TCPS_LAST_ACK:
2305                               if (ourfinisacked) {
2306                                         tp = tcp_close(tp);
2307                                         goto drop;
2308                               }
2309                               break;
2310 
2311                     /*
2312                      * In TIME_WAIT state the only thing that should arrive
2313                      * is a retransmission of the remote FIN.  Acknowledge
2314                      * it and restart the finack timer.
2315                      */
2316                     case TCPS_TIME_WAIT:
2317                               tcp_callout_reset(tp, tp->tt_2msl, 2 * tcp_rmx_msl(tp),
2318                                   tcp_timer_2msl);
2319                               goto dropafterack;
2320                     }
2321           }
2322 
2323 step6:
2324           /*
2325            * Update window information.
2326            * Don't look at window if no ACK: TAC's send garbage on first SYN.
2327            */
2328           if ((thflags & TH_ACK) &&
2329               acceptable_window_update(tp, th, tiwin)) {
2330                     /* keep track of pure window updates */
2331                     if (tlen == 0 && tp->snd_wl2 == th->th_ack &&
2332                         tiwin > tp->snd_wnd)
2333                               tcpstat.tcps_rcvwinupd++;
2334                     tp->snd_wnd = tiwin;
2335                     tp->snd_wl1 = th->th_seq;
2336                     tp->snd_wl2 = th->th_ack;
2337                     if (tp->snd_wnd > tp->max_sndwnd)
2338                               tp->max_sndwnd = tp->snd_wnd;
2339                     needoutput = TRUE;
2340           }
2341 
2342           /*
2343            * Process segments with URG.
2344            */
2345           if ((thflags & TH_URG) && th->th_urp &&
2346               !TCPS_HAVERCVDFIN(tp->t_state)) {
2347                     /*
2348                      * This is a kludge, but if we receive and accept
2349                      * random urgent pointers, we'll crash in
2350                      * soreceive.  It's hard to imagine someone
2351                      * actually wanting to send this much urgent data.
2352                      */
2353                     if (th->th_urp + so->so_rcv.ssb_cc > sb_max) {
2354                               th->th_urp = 0;                         /* XXX */
2355                               thflags &= ~TH_URG;           /* XXX */
2356                               goto dodata;                            /* XXX */
2357                     }
2358                     /*
2359                      * If this segment advances the known urgent pointer,
2360                      * then mark the data stream.  This should not happen
2361                      * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2362                      * a FIN has been received from the remote side.
2363                      * In these states we ignore the URG.
2364                      *
2365                      * According to RFC961 (Assigned Protocols),
2366                      * the urgent pointer points to the last octet
2367                      * of urgent data.  We continue, however,
2368                      * to consider it to indicate the first octet
2369                      * of data past the urgent section as the original
2370                      * spec states (in one of two places).
2371                      */
2372                     if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
2373                               tp->rcv_up = th->th_seq + th->th_urp;
2374                               so->so_oobmark = so->so_rcv.ssb_cc +
2375                                   (tp->rcv_up - tp->rcv_nxt) - 1;
2376                               if (so->so_oobmark == 0)
2377                                         sosetstate(so, SS_RCVATMARK);
2378                               sohasoutofband(so);
2379                               tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2380                     }
2381                     /*
2382                      * Remove out of band data so it doesn't get presented to user.
2383                      * This can happen independent of advancing the URG pointer,
2384                      * but if two URG's are pending at once, some out-of-band
2385                      * data may creep in... ick.
2386                      */
2387                     if (th->th_urp <= (u_long)tlen &&
2388                         !(so->so_options & SO_OOBINLINE)) {
2389                               /* hdr drop is delayed */
2390                               tcp_pulloutofband(so, th, m, drop_hdrlen);
2391                     }
2392           } else {
2393                     /*
2394                      * If no out of band data is expected,
2395                      * pull receive urgent pointer along
2396                      * with the receive window.
2397                      */
2398                     if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2399                               tp->rcv_up = tp->rcv_nxt;
2400           }
2401 
2402 dodata:                                                               /* XXX */
2403           /*
2404            * Process the segment text, merging it into the TCP sequencing queue,
2405            * and arranging for acknowledgment of receipt if necessary.
2406            * This process logically involves adjusting tp->rcv_wnd as data
2407            * is presented to the user (this happens in tcp_usrreq.c,
2408            * case PRU_RCVD).  If a FIN has already been received on this
2409            * connection then we just ignore the text.
2410            */
2411           if ((tlen || (thflags & TH_FIN)) && !TCPS_HAVERCVDFIN(tp->t_state)) {
2412                     if (thflags & TH_FIN)
2413                               tp->t_flags |= TF_SAWFIN;
2414                     m_adj(m, drop_hdrlen);        /* delayed header drop */
2415                     /*
2416                      * Insert segment which includes th into TCP reassembly queue
2417                      * with control block tp.  Set thflags to whether reassembly now
2418                      * includes a segment with FIN.  This handles the common case
2419                      * inline (segment is the next to be received on an established
2420                      * connection, and the queue is empty), avoiding linkage into
2421                      * and removal from the queue and repetition of various
2422                      * conversions.
2423                      * Set DELACK for segments received in order, but ack
2424                      * immediately when segments are out of order (so
2425                      * fast retransmit can work).
2426                      */
2427                     if (th->th_seq == tp->rcv_nxt &&
2428                         TAILQ_EMPTY(&tp->t_segq) &&
2429                         TCPS_HAVEESTABLISHED(tp->t_state)) {
2430                               if (thflags & TH_FIN)
2431                                         tp->t_flags |= TF_QUEDFIN;
2432                               if (DELAY_ACK(tp)) {
2433                                         tcp_callout_reset(tp, tp->tt_delack,
2434                                             tcp_delacktime, tcp_timer_delack);
2435                               } else {
2436                                         tp->t_flags |= TF_ACKNOW;
2437                               }
2438                               tp->rcv_nxt += tlen;
2439                               thflags = th->th_flags & TH_FIN;
2440                               tcpstat.tcps_rcvpack++;
2441                               tcpstat.tcps_rcvbyte += tlen;
2442                               ND6_HINT(tp);
2443                               if (so->so_state & SS_CANTRCVMORE) {
2444                                         m_freem(m);
2445                               } else {
2446                                         lwkt_gettoken(&so->so_rcv.ssb_token);
2447                                         ssb_appendstream(&so->so_rcv, m);
2448                                         lwkt_reltoken(&so->so_rcv.ssb_token);
2449                               }
2450                               sorwakeup(so);
2451                     } else {
2452                               if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
2453                                         /* Initialize SACK report block. */
2454                                         tp->reportblk.rblk_start = th->th_seq;
2455                                         tp->reportblk.rblk_end = TCP_SACK_BLKEND(
2456                                             th->th_seq + tlen, thflags);
2457                               }
2458                               thflags = tcp_reass(tp, th, &tlen, m);
2459                               tp->t_flags |= TF_ACKNOW;
2460                     }
2461 
2462                     /*
2463                      * Note the amount of data that peer has sent into
2464                      * our window, in order to estimate the sender's
2465                      * buffer size.
2466                      */
2467                     len = so->so_rcv.ssb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2468           } else {
2469                     m_freem(m);
2470                     thflags &= ~TH_FIN;
2471           }
2472 
2473           /*
2474            * If FIN is received ACK the FIN and let the user know
2475            * that the connection is closing.
2476            */
2477           if (thflags & TH_FIN) {
2478                     if (!TCPS_HAVERCVDFIN(tp->t_state)) {
2479                               socantrcvmore(so);
2480                               /*
2481                                * If connection is half-synchronized
2482                                * (ie NEEDSYN flag on) then delay ACK,
2483                                * so it may be piggybacked when SYN is sent.
2484                                * Otherwise, since we received a FIN then no
2485                                * more input can be expected, send ACK now.
2486                                */
2487                               if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) {
2488                                         tcp_callout_reset(tp, tp->tt_delack,
2489                                             tcp_delacktime, tcp_timer_delack);
2490                               } else {
2491                                         tp->t_flags |= TF_ACKNOW;
2492                               }
2493                               tp->rcv_nxt++;
2494                     }
2495 
2496                     switch (tp->t_state) {
2497                     /*
2498                      * In SYN_RECEIVED and ESTABLISHED STATES
2499                      * enter the CLOSE_WAIT state.
2500                      */
2501                     case TCPS_SYN_RECEIVED:
2502                               tp->t_starttime = ticks;
2503                               /*FALLTHROUGH*/
2504                     case TCPS_ESTABLISHED:
2505                               TCP_STATE_CHANGE(tp, TCPS_CLOSE_WAIT);
2506                               break;
2507 
2508                     /*
2509                      * If still in FIN_WAIT_1 STATE FIN has not been acked so
2510                      * enter the CLOSING state.
2511                      */
2512                     case TCPS_FIN_WAIT_1:
2513                               TCP_STATE_CHANGE(tp, TCPS_CLOSING);
2514                               break;
2515 
2516                     /*
2517                      * In FIN_WAIT_2 state enter the TIME_WAIT state,
2518                      * starting the time-wait timer, turning off the other
2519                      * standard timers.
2520                      */
2521                     case TCPS_FIN_WAIT_2:
2522                               TCP_STATE_CHANGE(tp, TCPS_TIME_WAIT);
2523                               tcp_canceltimers(tp);
2524                               tcp_callout_reset(tp, tp->tt_2msl, 2 * tcp_rmx_msl(tp),
2525                                             tcp_timer_2msl);
2526                               soisdisconnected(so);
2527                               break;
2528 
2529                     /*
2530                      * In TIME_WAIT state restart the 2 MSL time_wait timer.
2531                      */
2532                     case TCPS_TIME_WAIT:
2533                               tcp_callout_reset(tp, tp->tt_2msl, 2 * tcp_rmx_msl(tp),
2534                                   tcp_timer_2msl);
2535                               break;
2536                     }
2537           }
2538 
2539 #ifdef TCPDEBUG
2540           if (so->so_options & SO_DEBUG)
2541                     tcp_trace(TA_INPUT, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
2542 #endif
2543 
2544           /*
2545            * Delayed duplicated ACK processing
2546            */
2547           if (delayed_dupack && tcp_recv_dupack(tp, th_dupack, to_flags))
2548                     needoutput = FALSE;
2549 
2550           /*
2551            * Return any desired output.
2552            */
2553           if ((tp->t_flags & TF_ACKNOW) ||
2554               (needoutput && tcp_sack_report_needed(tp))) {
2555                     tcp_output_cancel(tp);
2556                     tcp_output_fair(tp);
2557           } else if (needoutput && !tcp_output_pending(tp)) {
2558                     tcp_output_fair(tp);
2559           }
2560           tcp_sack_report_cleanup(tp);
2561           return(IPPROTO_DONE);
2562 
2563 dropafterack:
2564           /*
2565            * Generate an ACK dropping incoming segment if it occupies
2566            * sequence space, where the ACK reflects our state.
2567            *
2568            * We can now skip the test for the RST flag since all
2569            * paths to this code happen after packets containing
2570            * RST have been dropped.
2571            *
2572            * In the SYN-RECEIVED state, don't send an ACK unless the
2573            * segment we received passes the SYN-RECEIVED ACK test.
2574            * If it fails send a RST.  This breaks the loop in the
2575            * "LAND" DoS attack, and also prevents an ACK storm
2576            * between two listening ports that have been sent forged
2577            * SYN segments, each with the source address of the other.
2578            */
2579           if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2580               (SEQ_GT(tp->snd_una, th->th_ack) ||
2581                SEQ_GT(th->th_ack, tp->snd_max)) ) {
2582                     rstreason = BANDLIM_RST_OPENPORT;
2583                     goto dropwithreset;
2584           }
2585 #ifdef TCPDEBUG
2586           if (so->so_options & SO_DEBUG)
2587                     tcp_trace(TA_DROP, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
2588 #endif
2589           m_freem(m);
2590           tp->t_flags |= TF_ACKNOW;
2591           tcp_output(tp);
2592           tcp_sack_report_cleanup(tp);
2593           return(IPPROTO_DONE);
2594 
2595 dropwithreset:
2596           /*
2597            * Generate a RST, dropping incoming segment.
2598            * Make ACK acceptable to originator of segment.
2599            * Don't bother to respond if destination was broadcast/multicast.
2600            */
2601           if ((thflags & TH_RST) || m->m_flags & (M_BCAST | M_MCAST))
2602                     goto drop;
2603           if (isipv6) {
2604                     if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2605                         IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2606                               goto drop;
2607           } else {
2608                     if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2609                         IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2610                         ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2611                         in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2612                               goto drop;
2613           }
2614           /* IPv6 anycast check is done at tcp6_input() */
2615 
2616           /*
2617            * Perform bandwidth limiting.
2618            */
2619 #ifdef ICMP_BANDLIM
2620           if (badport_bandlim(rstreason) < 0)
2621                     goto drop;
2622 #endif
2623 
2624 #ifdef TCPDEBUG
2625           if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2626                     tcp_trace(TA_DROP, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
2627 #endif
2628           if (thflags & TH_ACK)
2629                     /* mtod() below is safe as long as hdr dropping is delayed */
2630                     tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2631                                   TH_RST);
2632           else {
2633                     if (thflags & TH_SYN)
2634                               tlen++;
2635                     /* mtod() below is safe as long as hdr dropping is delayed */
2636                     tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen,
2637                                   (tcp_seq)0, TH_RST | TH_ACK);
2638           }
2639           if (tp != NULL)
2640                     tcp_sack_report_cleanup(tp);
2641           return(IPPROTO_DONE);
2642 
2643 drop:
2644           /*
2645            * Drop space held by incoming segment and return.
2646            */
2647 #ifdef TCPDEBUG
2648           if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2649                     tcp_trace(TA_DROP, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
2650 #endif
2651           m_freem(m);
2652           if (tp != NULL)
2653                     tcp_sack_report_cleanup(tp);
2654           return(IPPROTO_DONE);
2655 }
2656 
2657 /*
2658  * Parse TCP options and place in tcpopt.
2659  */
2660 static void
tcp_dooptions(struct tcpopt * to,u_char * cp,int cnt,boolean_t is_syn,tcp_seq ack)2661 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn,
2662     tcp_seq ack)
2663 {
2664           int opt, optlen, i;
2665 
2666           to->to_flags = 0;
2667           for (; cnt > 0; cnt -= optlen, cp += optlen) {
2668                     opt = cp[0];
2669                     if (opt == TCPOPT_EOL)
2670                               break;
2671                     if (opt == TCPOPT_NOP)
2672                               optlen = 1;
2673                     else {
2674                               if (cnt < 2)
2675                                         break;
2676                               optlen = cp[1];
2677                               if (optlen < 2 || optlen > cnt)
2678                                         break;
2679                     }
2680                     switch (opt) {
2681                     case TCPOPT_MAXSEG:
2682                               if (optlen != TCPOLEN_MAXSEG)
2683                                         continue;
2684                               if (!is_syn)
2685                                         continue;
2686                               to->to_flags |= TOF_MSS;
2687                               bcopy(cp + 2, &to->to_mss, sizeof to->to_mss);
2688                               to->to_mss = ntohs(to->to_mss);
2689                               break;
2690                     case TCPOPT_WINDOW:
2691                               if (optlen != TCPOLEN_WINDOW)
2692                                         continue;
2693                               if (!is_syn)
2694                                         continue;
2695                               to->to_flags |= TOF_SCALE;
2696                               to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2697                               break;
2698                     case TCPOPT_TIMESTAMP:
2699                               if (optlen != TCPOLEN_TIMESTAMP)
2700                                         continue;
2701                               to->to_flags |= TOF_TS;
2702                               bcopy(cp + 2, &to->to_tsval, sizeof to->to_tsval);
2703                               to->to_tsval = ntohl(to->to_tsval);
2704                               bcopy(cp + 6, &to->to_tsecr, sizeof to->to_tsecr);
2705                               to->to_tsecr = ntohl(to->to_tsecr);
2706                               /*
2707                                * If echoed timestamp is later than the current time,
2708                                * fall back to non RFC1323 RTT calculation.
2709                                */
2710                               if (to->to_tsecr != 0 && TSTMP_GT(to->to_tsecr, ticks))
2711                                         to->to_tsecr = 0;
2712                               break;
2713                     case TCPOPT_SACK_PERMITTED:
2714                               if (optlen != TCPOLEN_SACK_PERMITTED)
2715                                         continue;
2716                               if (!is_syn)
2717                                         continue;
2718                               to->to_flags |= TOF_SACK_PERMITTED;
2719                               break;
2720                     case TCPOPT_SACK:
2721                               if ((optlen - 2) & 0x07)      /* not multiple of 8 */
2722                                         continue;
2723                               to->to_nsackblocks = (optlen - 2) / 8;
2724                               to->to_sackblocks = (struct raw_sackblock *) (cp + 2);
2725                               to->to_flags |= TOF_SACK;
2726                               for (i = 0; i < to->to_nsackblocks; i++) {
2727                                         struct raw_sackblock *r = &to->to_sackblocks[i];
2728 
2729                                         r->rblk_start = ntohl(r->rblk_start);
2730                                         r->rblk_end = ntohl(r->rblk_end);
2731 
2732                                         if (SEQ_LEQ(r->rblk_end, r->rblk_start)) {
2733                                                   /*
2734                                                    * Invalid SACK block; discard all
2735                                                    * SACK blocks
2736                                                    */
2737                                                   tcpstat.tcps_rcvbadsackopt++;
2738                                                   to->to_nsackblocks = 0;
2739                                                   to->to_sackblocks = NULL;
2740                                                   to->to_flags &= ~TOF_SACK;
2741                                                   break;
2742                                         }
2743                               }
2744                               if ((to->to_flags & TOF_SACK) &&
2745                                   tcp_sack_ndsack_blocks(to->to_sackblocks,
2746                                   to->to_nsackblocks, ack))
2747                                         to->to_flags |= TOF_DSACK;
2748                               break;
2749 #ifdef TCP_SIGNATURE
2750                     /*
2751                      * XXX In order to reply to a host which has set the
2752                      * TCP_SIGNATURE option in its initial SYN, we have to
2753                      * record the fact that the option was observed here
2754                      * for the syncache code to perform the correct response.
2755                      */
2756                     case TCPOPT_SIGNATURE:
2757                               if (optlen != TCPOLEN_SIGNATURE)
2758                                         continue;
2759                               to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
2760                               break;
2761 #endif /* TCP_SIGNATURE */
2762                     default:
2763                               continue;
2764                     }
2765           }
2766 }
2767 
2768 /*
2769  * Pull out of band byte out of a segment so
2770  * it doesn't appear in the user's data queue.
2771  * It is still reflected in the segment length for
2772  * sequencing purposes.
2773  * "off" is the delayed to be dropped hdrlen.
2774  */
2775 static void
tcp_pulloutofband(struct socket * so,struct tcphdr * th,struct mbuf * m,int off)2776 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off)
2777 {
2778           int cnt = off + th->th_urp - 1;
2779 
2780           while (cnt >= 0) {
2781                     if (m->m_len > cnt) {
2782                               char *cp = mtod(m, caddr_t) + cnt;
2783                               struct tcpcb *tp = sototcpcb(so);
2784 
2785                               tp->t_iobc = *cp;
2786                               tp->t_oobflags |= TCPOOB_HAVEDATA;
2787                               bcopy(cp + 1, cp, m->m_len - cnt - 1);
2788                               m->m_len--;
2789                               if (m->m_flags & M_PKTHDR)
2790                                         m->m_pkthdr.len--;
2791                               return;
2792                     }
2793                     cnt -= m->m_len;
2794                     m = m->m_next;
2795                     if (m == NULL)
2796                               break;
2797           }
2798           panic("tcp_pulloutofband");
2799 }
2800 
2801 /*
2802  * Collect new round-trip time estimate and update averages and current
2803  * timeout.
2804  */
2805 static void
tcp_xmit_timer(struct tcpcb * tp,int rtt,tcp_seq ack)2806 tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
2807 {
2808           int rebaserto = 0;
2809 
2810           tcpstat.tcps_rttupdated++;
2811           tp->t_rttupdated++;
2812           if ((tp->rxt_flags & TRXT_F_REBASERTO) &&
2813               SEQ_GT(ack, tp->snd_max_prev)) {
2814 #ifdef DEBUG_EIFEL_RESPONSE
2815                     kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
2816                         tp->t_srtt_prev, tp->t_rttvar_prev,
2817                         tp->t_srtt, tp->t_rttvar);
2818 #endif
2819 
2820                     tcpstat.tcps_eifelresponse++;
2821                     rebaserto = 1;
2822                     tp->rxt_flags &= ~TRXT_F_REBASERTO;
2823                     tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
2824                     tp->t_rttvar = max(tp->t_rttvar_prev,
2825                         (rtt << (TCP_RTTVAR_SHIFT - 1)));
2826                     if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
2827                               tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2828 
2829 #ifdef DEBUG_EIFEL_RESPONSE
2830                     kprintf("new %d/%d ", tp->t_srtt, tp->t_rttvar);
2831 #endif
2832           } else if (tp->t_srtt != 0) {
2833                     int delta;
2834 
2835                     /*
2836                      * srtt is stored as fixed point with 5 bits after the
2837                      * binary point (i.e., scaled by 32).  The following magic
2838                      * is equivalent to the smoothing algorithm in rfc793 with
2839                      * an alpha of .875 (srtt = rtt/32 + srtt*31/32 in fixed
2840                      * point).  Adjust rtt to origin 0.
2841                      */
2842                     delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2843                               - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2844 
2845                     if ((tp->t_srtt += delta) <= 0)
2846                               tp->t_srtt = 1;
2847 
2848                     /*
2849                      * We accumulate a smoothed rtt variance (actually, a
2850                      * smoothed mean difference), then set the retransmit
2851                      * timer to smoothed rtt + 4 times the smoothed variance.
2852                      * rttvar is stored as fixed point with 4 bits after the
2853                      * binary point (scaled by 16).  The following is
2854                      * equivalent to rfc793 smoothing with an alpha of .75
2855                      * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
2856                      * rfc793's wired-in beta.
2857                      */
2858                     if (delta < 0)
2859                               delta = -delta;
2860                     delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2861                     if ((tp->t_rttvar += delta) <= 0)
2862                               tp->t_rttvar = 1;
2863                     if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
2864                               tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2865           } else {
2866                     /*
2867                      * No rtt measurement yet - use the unsmoothed rtt.
2868                      * Set the variance to half the rtt (so our first
2869                      * retransmit happens at 3*rtt).
2870                      */
2871                     tp->t_srtt = rtt << TCP_RTT_SHIFT;
2872                     tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2873                     tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2874           }
2875           tp->t_rtttime = 0;
2876           tp->t_rxtshift = 0;
2877 
2878 #ifdef DEBUG_EIFEL_RESPONSE
2879           if (rebaserto) {
2880                     kprintf("| rxtcur prev %d, old %d, ",
2881                         tp->t_rxtcur_prev, tp->t_rxtcur);
2882           }
2883 #endif
2884 
2885           /*
2886            * the retransmit should happen at rtt + 4 * rttvar.
2887            * Because of the way we do the smoothing, srtt and rttvar
2888            * will each average +1/2 tick of bias.  When we compute
2889            * the retransmit timer, we want 1/2 tick of rounding and
2890            * 1 extra tick because of +-1/2 tick uncertainty in the
2891            * firing of the timer.  The bias will give us exactly the
2892            * 1.5 tick we need.  But, because the bias is
2893            * statistical, we have to test that we don't drop below
2894            * the minimum feasible timer (which is 2 ticks).
2895            */
2896           TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2897                           max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2898 
2899           if (rebaserto) {
2900                     if (tp->t_rxtcur < tp->t_rxtcur_prev + tcp_eifel_rtoinc) {
2901                               /*
2902                                * RFC4015 requires that the new RTO is at least
2903                                * 2*G (tcp_eifel_rtoinc) greater then the RTO
2904                                * (t_rxtcur_prev) when the spurious retransmit
2905                                * timeout happens.
2906                                *
2907                                * The above condition could be true, if the SRTT
2908                                * and RTTVAR used to calculate t_rxtcur_prev
2909                                * resulted in a value less than t_rttmin.  So
2910                                * simply increasing SRTT by tcp_eifel_rtoinc when
2911                                * preparing for the Eifel response could not ensure
2912                                * that the new RTO will be tcp_eifel_rtoinc greater
2913                                * t_rxtcur_prev.
2914                                */
2915                               tp->t_rxtcur = tp->t_rxtcur_prev + tcp_eifel_rtoinc;
2916                     }
2917 #ifdef DEBUG_EIFEL_RESPONSE
2918                     kprintf("new %d\n", tp->t_rxtcur);
2919 #endif
2920           }
2921 
2922           /*
2923            * We received an ack for a packet that wasn't retransmitted;
2924            * it is probably safe to discard any error indications we've
2925            * received recently.  This isn't quite right, but close enough
2926            * for now (a route might have failed after we sent a segment,
2927            * and the return path might not be symmetrical).
2928            */
2929           tp->t_softerror = 0;
2930 }
2931 
2932 /*
2933  * Determine a reasonable value for maxseg size.
2934  * If the route is known, check route for mtu.
2935  * If none, use an mss that can be handled on the outgoing
2936  * interface without forcing IP to fragment; if bigger than
2937  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2938  * to utilize large mbufs.  If no route is found, route has no mtu,
2939  * or the destination isn't local, use a default, hopefully conservative
2940  * size (usually 512 or the default IP max size, but no more than the mtu
2941  * of the interface), as we can't discover anything about intervening
2942  * gateways or networks.  We also initialize the congestion/slow start
2943  * window to be a single segment if the destination isn't local.
2944  *
2945  * Also take into account the space needed for options that we
2946  * send regularly.  Make maxseg shorter by that amount to assure
2947  * that we can send maxseg amount of data even when the options
2948  * are present.  Store the upper limit of the length of options plus
2949  * data in maxopd.
2950  *
2951  * NOTE that this routine is only called when we process an incoming
2952  * segment, for outgoing segments only tcp_mssopt is called.
2953  */
2954 static void
tcp_rmx_mss(struct tcpcb * tp,struct rtentry * rt,int offer)2955 tcp_rmx_mss(struct tcpcb *tp, struct rtentry *rt, int offer)
2956 {
2957           struct ifnet *ifp;
2958           int mss;
2959           u_long bufsize;
2960           struct inpcb *inp = tp->t_inpcb;
2961           struct socket *so;
2962 #ifdef INET6
2963           boolean_t isipv6 = INP_ISIPV6(inp);
2964           size_t min_protoh = isipv6 ?
2965                                   sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
2966                                   sizeof(struct tcpiphdr);
2967 #else
2968           const boolean_t isipv6 = FALSE;
2969           const size_t min_protoh = sizeof(struct tcpiphdr);
2970 #endif
2971 
2972           if (rt == NULL) {
2973                     tp->t_maxopd = tp->t_maxseg =
2974                         (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
2975                     return;
2976           }
2977           ifp = rt->rt_ifp;
2978           so = inp->inp_socket;
2979 
2980           /*
2981            * Offer == 0 means that there was no MSS on the SYN segment,
2982            * in this case we use either the interface mtu or tcp_mssdflt.
2983            *
2984            * An offer which is too large will be cut down later.
2985            */
2986           if (offer == 0) {
2987                     if (isipv6) {
2988                               if (in6_localaddr(&inp->in6p_faddr))
2989                                         offer = IN6_LINKMTU(rt->rt_ifp) - min_protoh;
2990                               else
2991                                         offer = tcp_v6mssdflt;
2992                     } else {
2993                               if (in_localaddr(inp->inp_faddr))
2994                                         offer = ifp->if_mtu - min_protoh;
2995                               else
2996                                         offer = tcp_mssdflt;
2997                     }
2998           }
2999 
3000           /*
3001            * Prevent DoS attack with too small MSS. Round up
3002            * to at least minmss.
3003            *
3004            * Sanity check: make sure that maxopd will be large
3005            * enough to allow some data on segments even if
3006            * all the option space is used (40 bytes).  Otherwise
3007            * funny things may happen in tcp_output.
3008            */
3009           offer = max(offer, tcp_minmss);
3010           offer = max(offer, 64);
3011 
3012           rt->rt_rmx.rmx_mssopt = offer;
3013 
3014           /*
3015            * if there's an mtu associated with the route, use it
3016            * else, use the link mtu.  Take the smaller of mss or offer
3017            * as our final mss.
3018            */
3019           if (rt->rt_rmx.rmx_mtu) {
3020                     mss = rt->rt_rmx.rmx_mtu;
3021           } else {
3022                     if (isipv6)
3023                               mss = IN6_LINKMTU(rt->rt_ifp);
3024                     else
3025                               mss = ifp->if_mtu;
3026           }
3027           mss -= min_protoh;
3028           mss = min(mss, offer);
3029 
3030           /*
3031            * maxopd stores the maximum length of data AND options
3032            * in a segment; maxseg is the amount of data in a normal
3033            * segment.  We need to store this value (maxopd) apart
3034            * from maxseg, because now every segment carries options
3035            * and thus we normally have somewhat less data in segments.
3036            */
3037           tp->t_maxopd = mss;
3038 
3039           if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
3040               ((tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
3041                     mss -= TCPOLEN_TSTAMP_APPA;
3042 
3043 #if       (MCLBYTES & (MCLBYTES - 1)) == 0
3044           if (mss > MCLBYTES)
3045                     mss &= ~(MCLBYTES-1);
3046 #else
3047           if (mss > MCLBYTES)
3048                     mss = rounddown(mss, MCLBYTES);
3049 #endif
3050           /*
3051            * If there's a pipesize, change the socket buffer
3052            * to that size.  Make the socket buffers an integral
3053            * number of mss units; if the mss is larger than
3054            * the socket buffer, decrease the mss.
3055            */
3056 #ifdef RTV_SPIPE
3057           if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
3058 #endif
3059                     bufsize = so->so_snd.ssb_hiwat;
3060           if (bufsize < mss)
3061                     mss = bufsize;
3062           else {
3063                     bufsize = roundup(bufsize, mss);
3064                     if (bufsize > sb_max)
3065                               bufsize = sb_max;
3066                     if (bufsize > so->so_snd.ssb_hiwat)
3067                               ssb_reserve(&so->so_snd, bufsize, so, NULL);
3068           }
3069           tp->t_maxseg = mss;
3070 
3071 #ifdef RTV_RPIPE
3072           if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
3073 #endif
3074                     bufsize = so->so_rcv.ssb_hiwat;
3075           if (bufsize > mss) {
3076                     bufsize = roundup(bufsize, mss);
3077                     if (bufsize > sb_max)
3078                               bufsize = sb_max;
3079                     if (bufsize > so->so_rcv.ssb_hiwat) {
3080                               lwkt_gettoken(&so->so_rcv.ssb_token);
3081                               ssb_reserve(&so->so_rcv, bufsize, so, NULL);
3082                               lwkt_reltoken(&so->so_rcv.ssb_token);
3083                     }
3084           }
3085 
3086           /*
3087            * Set the slow-start flight size
3088            *
3089            * NOTE: t_maxseg must have been configured!
3090            */
3091           tp->snd_cwnd = tcp_initial_window(tp);
3092 
3093           if (rt->rt_rmx.rmx_ssthresh) {
3094                     /*
3095                      * There's some sort of gateway or interface
3096                      * buffer limit on the path.  Use this to set
3097                      * the slow start threshhold, but set the
3098                      * threshold to no less than 2*mss.
3099                      */
3100                     tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
3101                     tcpstat.tcps_usedssthresh++;
3102           }
3103 }
3104 
3105 static void
tcp_rmx_rtt(struct tcpcb * tp,struct rtentry * rt)3106 tcp_rmx_rtt(struct tcpcb *tp, struct rtentry *rt)
3107 {
3108           int rtt;
3109 
3110           if (rt == NULL)
3111                     return;
3112 
3113           /*
3114            * Check if there's an initial rtt or rttvar.  Convert
3115            * from the route-table units to scaled multiples of
3116            * the slow timeout timer.
3117            */
3118           if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
3119                     /*
3120                      * XXX the lock bit for RTT indicates that the value
3121                      * is also a minimum value; this is subject to time.
3122                      */
3123                     if (rt->rt_rmx.rmx_locks & RTV_RTT)
3124                               tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
3125                     tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
3126                     tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
3127                     tcpstat.tcps_usedrtt++;
3128                     if (rt->rt_rmx.rmx_rttvar) {
3129                               tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
3130                                   (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
3131                               tcpstat.tcps_usedrttvar++;
3132                     } else {
3133                               /* default variation is +- 1 rtt */
3134                               tp->t_rttvar =
3135                                   tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
3136                     }
3137                     TCPT_RANGESET(tp->t_rxtcur,
3138                                     ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
3139                                     tp->t_rttmin, TCPTV_REXMTMAX);
3140           }
3141 }
3142 
3143 void
tcp_rmx_init(struct tcpcb * tp,int offer)3144 tcp_rmx_init(struct tcpcb *tp, int offer)
3145 {
3146           struct inpcb *inp = tp->t_inpcb;
3147 #ifdef INET6
3148           boolean_t isipv6 = INP_ISIPV6(inp);
3149 #else
3150           const boolean_t isipv6 = FALSE;
3151 #endif
3152           struct rtentry *rt;
3153 
3154           if (isipv6)
3155                     rt = tcp_rtlookup6(&inp->inp_inc);
3156           else
3157                     rt = tcp_rtlookup(&inp->inp_inc);
3158 
3159           tcp_rmx_mss(tp, rt, offer);
3160           tcp_rmx_rtt(tp, rt);
3161 
3162           if (rt != NULL && !tcp_ncr_linklocal && (rt->rt_flags & RTF_LLINFO)) {
3163                     /* Don't enable NCR on link-local network. */
3164                     tp->t_flags &= ~TF_NCR;
3165           }
3166 }
3167 
3168 /*
3169  * Determine the MSS option to send on an outgoing SYN.
3170  */
3171 int
tcp_mssopt(struct tcpcb * tp)3172 tcp_mssopt(struct tcpcb *tp)
3173 {
3174           struct rtentry *rt;
3175 #ifdef INET6
3176           boolean_t isipv6 = INP_ISIPV6(tp->t_inpcb);
3177           int min_protoh = isipv6 ?
3178                                    sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
3179                                    sizeof(struct tcpiphdr);
3180 #else
3181           const boolean_t isipv6 = FALSE;
3182           const size_t min_protoh = sizeof(struct tcpiphdr);
3183 #endif
3184 
3185           if (isipv6)
3186                     rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
3187           else
3188                     rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
3189           if (rt == NULL)
3190                     return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
3191 
3192 #ifdef INET6
3193           return ((isipv6 ? IN6_LINKMTU(rt->rt_ifp) : rt->rt_ifp->if_mtu) -
3194               min_protoh);
3195 #else
3196           return (rt->rt_ifp->if_mtu - min_protoh);
3197 #endif
3198 }
3199 
3200 /*
3201  * When a partial ack arrives, force the retransmission of the
3202  * next unacknowledged segment.  Do not exit Fast Recovery.
3203  *
3204  * Implement the Slow-but-Steady variant of NewReno by restarting the
3205  * the retransmission timer.  Turn it off here so it can be restarted
3206  * later in tcp_output().
3207  */
3208 static void
tcp_newreno_partial_ack(struct tcpcb * tp,struct tcphdr * th,int acked)3209 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th, int acked)
3210 {
3211           tcp_seq old_snd_nxt = tp->snd_nxt;
3212           u_long ocwnd = tp->snd_cwnd;
3213 
3214           tcp_callout_stop(tp, tp->tt_rexmt);
3215           tp->t_rtttime = 0;
3216           tp->snd_nxt = th->th_ack;
3217           /* Set snd_cwnd to one segment beyond acknowledged offset. */
3218           tp->snd_cwnd = tp->t_maxseg;
3219           tp->t_flags |= TF_ACKNOW;
3220           tcp_output(tp);
3221           if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
3222                     tp->snd_nxt = old_snd_nxt;
3223           /* partial window deflation */
3224           if (ocwnd > acked)
3225                     tp->snd_cwnd = ocwnd - acked + tp->t_maxseg;
3226           else
3227                     tp->snd_cwnd = tp->t_maxseg;
3228 }
3229 
3230 /*
3231  * In contrast to the Slow-but-Steady NewReno variant,
3232  * we do not reset the retransmission timer for SACK retransmissions,
3233  * except when retransmitting snd_una.
3234  */
3235 static void
tcp_sack_rexmt(struct tcpcb * tp,boolean_t force)3236 tcp_sack_rexmt(struct tcpcb *tp, boolean_t force)
3237 {
3238           tcp_seq old_snd_nxt = tp->snd_nxt;
3239           u_long ocwnd = tp->snd_cwnd;
3240           uint32_t pipe;
3241           int nseg = 0;                 /* consecutive new segments */
3242           int nseg_rexmt = 0; /* retransmitted segments */
3243           int maxrexmt = 0;
3244 
3245           if (force) {
3246                     uint32_t unsacked = tcp_sack_first_unsacked_len(tp);
3247 
3248                     /*
3249                      * Try to fill the first hole in the receiver's
3250                      * reassemble queue.
3251                      */
3252                     maxrexmt = howmany(unsacked, tp->t_maxseg);
3253                     if (maxrexmt > tcp_force_sackrxt)
3254                               maxrexmt = tcp_force_sackrxt;
3255           }
3256 
3257           tp->t_rtttime = 0;
3258           pipe = tcp_sack_compute_pipe(tp);
3259           while (((tcp_seq_diff_t)(ocwnd - pipe) >= (tcp_seq_diff_t)tp->t_maxseg
3260                   || (force && nseg_rexmt < maxrexmt && nseg == 0)) &&
3261               (!tcp_do_smartsack || nseg < TCP_SACK_MAXBURST)) {
3262                     tcp_seq old_snd_max, old_rexmt_high, nextrexmt;
3263                     uint32_t sent, seglen;
3264                     boolean_t rescue;
3265                     int error;
3266 
3267                     old_rexmt_high = tp->rexmt_high;
3268                     if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
3269                               tp->rexmt_high = old_rexmt_high;
3270                               break;
3271                     }
3272 
3273                     /*
3274                      * If the next tranmission is a rescue retranmission,
3275                      * we check whether we have already sent some data
3276                      * (either new segments or retransmitted segments)
3277                      * into the the network or not.  Since the idea of rescue
3278                      * retransmission is to sustain ACK clock, as long as
3279                      * some segments are in the network, ACK clock will be
3280                      * kept ticking.
3281                      */
3282                     if (rescue && (nseg_rexmt > 0 || nseg > 0)) {
3283                               tp->rexmt_high = old_rexmt_high;
3284                               break;
3285                     }
3286 
3287                     if (nextrexmt == tp->snd_max)
3288                               ++nseg;
3289                     else
3290                               ++nseg_rexmt;
3291                     tp->snd_nxt = nextrexmt;
3292                     tp->snd_cwnd = nextrexmt - tp->snd_una + seglen;
3293                     old_snd_max = tp->snd_max;
3294                     if (nextrexmt == tp->snd_una)
3295                               tcp_callout_stop(tp, tp->tt_rexmt);
3296                     tp->t_flags |= TF_XMITNOW;
3297                     error = tcp_output(tp);
3298                     if (error != 0) {
3299                               tp->rexmt_high = old_rexmt_high;
3300                               break;
3301                     }
3302                     sent = tp->snd_nxt - nextrexmt;
3303                     if (sent <= 0) {
3304                               tp->rexmt_high = old_rexmt_high;
3305                               break;
3306                     }
3307                     pipe += sent;
3308                     tcpstat.tcps_sndsackpack++;
3309                     tcpstat.tcps_sndsackbyte += sent;
3310 
3311                     if (rescue) {
3312                               tcpstat.tcps_sackrescue++;
3313                               tp->rexmt_rescue = tp->snd_nxt;
3314                               tp->sack_flags |= TSACK_F_SACKRESCUED;
3315                               break;
3316                     }
3317                     if (SEQ_LT(nextrexmt, old_snd_max) &&
3318                         SEQ_LT(tp->rexmt_high, tp->snd_nxt)) {
3319                               tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max);
3320                               if (tcp_aggressive_rescuesack &&
3321                                   (tp->sack_flags & TSACK_F_SACKRESCUED) &&
3322                                   SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
3323                                         /* Drag RescueRxt along with HighRxt */
3324                                         tp->rexmt_rescue = tp->rexmt_high;
3325                               }
3326                     }
3327           }
3328           if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
3329                     tp->snd_nxt = old_snd_nxt;
3330           tp->snd_cwnd = ocwnd;
3331 }
3332 
3333 /*
3334  * Return TRUE, if some new segments are sent
3335  */
3336 static boolean_t
tcp_sack_limitedxmit(struct tcpcb * tp)3337 tcp_sack_limitedxmit(struct tcpcb *tp)
3338 {
3339           tcp_seq oldsndnxt = tp->snd_nxt;
3340           tcp_seq oldsndmax = tp->snd_max;
3341           u_long ocwnd = tp->snd_cwnd;
3342           uint32_t pipe, sent;
3343           boolean_t ret = FALSE;
3344           tcp_seq_diff_t cwnd_left;
3345           tcp_seq next;
3346 
3347           tp->rexmt_high = tp->snd_una - 1;
3348           pipe = tcp_sack_compute_pipe(tp);
3349           cwnd_left = (tcp_seq_diff_t)(ocwnd - pipe);
3350           if (cwnd_left < (tcp_seq_diff_t)tp->t_maxseg)
3351                     return FALSE;
3352 
3353           if (tcp_do_smartsack)
3354                     cwnd_left = ulmin(cwnd_left, tp->t_maxseg * TCP_SACK_MAXBURST);
3355 
3356           next = tp->snd_nxt = tp->snd_max;
3357           tp->snd_cwnd = tp->snd_nxt - tp->snd_una +
3358               rounddown(cwnd_left, tp->t_maxseg);
3359 
3360           tp->t_flags |= TF_XMITNOW;
3361           tcp_output(tp);
3362 
3363           sent = tp->snd_nxt - next;
3364           if (sent > 0) {
3365                     tcpstat.tcps_sndlimited += howmany(sent, tp->t_maxseg);
3366                     ret = TRUE;
3367           }
3368 
3369           if (SEQ_LT(oldsndnxt, oldsndmax)) {
3370                     KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
3371                         ("snd_una moved in other threads"));
3372                     tp->snd_nxt = oldsndnxt;
3373           }
3374           tp->snd_cwnd = ocwnd;
3375 
3376           if (ret && TCP_DO_NCR(tp))
3377                     tcp_ncr_update_rxtthresh(tp);
3378 
3379           return ret;
3380 }
3381 
3382 /*
3383  * Reset idle time and keep-alive timer, typically called when a valid
3384  * tcp packet is received but may also be called when FASTKEEP is set
3385  * to prevent the previous long-timeout from calculating to a drop.
3386  *
3387  * Only update t_rcvtime for non-SYN packets.
3388  *
3389  * Handle the case where one side thinks the connection is established
3390  * but the other side has, say, rebooted without cleaning out the
3391  * connection.   The SYNs could be construed as an attack and wind
3392  * up ignored, but in case it isn't an attack we can validate the
3393  * connection by forcing a keepalive.
3394  */
3395 void
tcp_timer_keep_activity(struct tcpcb * tp,int thflags)3396 tcp_timer_keep_activity(struct tcpcb *tp, int thflags)
3397 {
3398           if (TCPS_HAVEESTABLISHED(tp->t_state)) {
3399                     if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN) {
3400                               tp->t_flags |= TF_KEEPALIVE;
3401                               tcp_callout_reset(tp, tp->tt_keep, hz / 2,
3402                                                     tcp_timer_keep);
3403                     } else {
3404                               tp->t_rcvtime = ticks;
3405                               tp->t_flags &= ~TF_KEEPALIVE;
3406                               tcp_callout_reset(tp, tp->tt_keep,
3407                                                     tp->t_keepidle,
3408                                                     tcp_timer_keep);
3409                     }
3410           }
3411 }
3412 
3413 static int
tcp_rmx_msl(const struct tcpcb * tp)3414 tcp_rmx_msl(const struct tcpcb *tp)
3415 {
3416           struct rtentry *rt;
3417           struct inpcb *inp = tp->t_inpcb;
3418           int msl;
3419 #ifdef INET6
3420           boolean_t isipv6 = INP_ISIPV6(inp);
3421 #else
3422           const boolean_t isipv6 = FALSE;
3423 #endif
3424 
3425           if (isipv6)
3426                     rt = tcp_rtlookup6(&inp->inp_inc);
3427           else
3428                     rt = tcp_rtlookup(&inp->inp_inc);
3429           if (rt == NULL || rt->rt_rmx.rmx_msl == 0)
3430                     return tcp_msl;
3431 
3432           msl = (rt->rt_rmx.rmx_msl * hz) / 1000;
3433           if (msl == 0)
3434                     msl = 1;
3435 
3436           return msl;
3437 }
3438 
3439 static void
tcp_established(struct tcpcb * tp)3440 tcp_established(struct tcpcb *tp)
3441 {
3442           TCP_STATE_CHANGE(tp, TCPS_ESTABLISHED);
3443           tcp_callout_reset(tp, tp->tt_keep, tp->t_keepidle, tcp_timer_keep);
3444 
3445           if (tp->t_rxtsyn > 0) {
3446                     /*
3447                      * RFC6298:
3448                      * "If the timer expires awaiting the ACK of a SYN segment
3449                      *  and the TCP implementation is using an RTO less than 3
3450                      *  seconds, the RTO MUST be re-initialized to 3 seconds
3451                      *  when data transmission begins"
3452                      */
3453                     if (tp->t_rxtcur < TCPTV_RTOBASE3)
3454                               tp->t_rxtcur = TCPTV_RTOBASE3;
3455           }
3456 }
3457 
3458 /*
3459  * Returns TRUE, if the ACK should be dropped
3460  */
3461 static boolean_t
tcp_recv_dupack(struct tcpcb * tp,tcp_seq th_ack,u_int to_flags)3462 tcp_recv_dupack(struct tcpcb *tp, tcp_seq th_ack, u_int to_flags)
3463 {
3464           boolean_t fast_sack_rexmt = TRUE;
3465 
3466           tcpstat.tcps_rcvdupack++;
3467 
3468           /*
3469            * We have outstanding data (other than a window probe),
3470            * this is a completely duplicate ack (ie, window info
3471            * didn't change), the ack is the biggest we've seen and
3472            * we've seen exactly our rexmt threshhold of them, so
3473            * assume a packet has been dropped and retransmit it.
3474            * Kludge snd_nxt & the congestion window so we send only
3475            * this one packet.
3476            */
3477           if (IN_FASTRECOVERY(tp)) {
3478                     if (TCP_DO_SACK(tp)) {
3479                               boolean_t force = FALSE;
3480 
3481                               if (tp->snd_una == tp->rexmt_high &&
3482                                   (to_flags & (TOF_SACK | TOF_SACK_REDUNDANT)) ==
3483                                   TOF_SACK) {
3484                                         /*
3485                                          * New segments got SACKed and
3486                                          * no retransmit yet.
3487                                          */
3488                                         force = TRUE;
3489                               }
3490 
3491                               /* No artifical cwnd inflation. */
3492                               tcp_sack_rexmt(tp, force);
3493                     } else {
3494                               /*
3495                                * Dup acks mean that packets have left
3496                                * the network (they're now cached at the
3497                                * receiver) so bump cwnd by the amount in
3498                                * the receiver to keep a constant cwnd
3499                                * packets in the network.
3500                                */
3501                               tp->snd_cwnd += tp->t_maxseg;
3502                               tcp_output(tp);
3503                     }
3504                     return TRUE;
3505           } else if (SEQ_LT(th_ack, tp->snd_recover)) {
3506                     tp->t_dupacks = 0;
3507                     return FALSE;
3508           } else if (tcp_ignore_redun_dsack && TCP_DO_SACK(tp) &&
3509               (to_flags & (TOF_DSACK | TOF_SACK_REDUNDANT)) ==
3510               (TOF_DSACK | TOF_SACK_REDUNDANT)) {
3511                     /*
3512                      * If the ACK carries DSACK and other SACK blocks
3513                      * carry information that we have already known,
3514                      * don't count this ACK as duplicate ACK.  This
3515                      * prevents spurious early retransmit and fast
3516                      * retransmit.  This also meets the requirement of
3517                      * RFC3042 that new segments should not be sent if
3518                      * the SACK blocks do not contain new information
3519                      * (XXX we actually loosen the requirment that only
3520                      * DSACK is checked here).
3521                      *
3522                      * This kind of ACKs are usually sent after spurious
3523                      * retransmit.
3524                      */
3525                     /* Do nothing; don't change t_dupacks */
3526                     return TRUE;
3527           } else if (tp->t_dupacks == 0 && TCP_DO_NCR(tp)) {
3528                     tcp_ncr_update_rxtthresh(tp);
3529           }
3530 
3531           if (++tp->t_dupacks == tp->t_rxtthresh) {
3532                     tcp_seq old_snd_nxt;
3533                     u_int win;
3534 
3535 fastretransmit:
3536                     if (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) {
3537                               tcp_save_congestion_state(tp);
3538                               tp->rxt_flags |= TRXT_F_FASTREXMT;
3539                     }
3540                     /*
3541                      * We know we're losing at the current window size,
3542                      * so do congestion avoidance: set ssthresh to half
3543                      * the current window and pull our congestion window
3544                      * back to the new ssthresh.
3545                      */
3546                     win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
3547                     if (win < 2)
3548                               win = 2;
3549                     tp->snd_ssthresh = win * tp->t_maxseg;
3550                     ENTER_FASTRECOVERY(tp);
3551                     tp->snd_recover = tp->snd_max;
3552                     tcp_callout_stop(tp, tp->tt_rexmt);
3553                     tp->t_rtttime = 0;
3554                     old_snd_nxt = tp->snd_nxt;
3555                     tp->snd_nxt = th_ack;
3556                     if (TCP_DO_SACK(tp)) {
3557                               uint32_t rxtlen;
3558 
3559                               rxtlen = tcp_sack_first_unsacked_len(tp);
3560                               if (rxtlen > tp->t_maxseg)
3561                                         rxtlen = tp->t_maxseg;
3562                               tp->snd_cwnd = rxtlen;
3563                     } else {
3564                               tp->snd_cwnd = tp->t_maxseg;
3565                     }
3566                     tcp_output(tp);
3567                     ++tcpstat.tcps_sndfastrexmit;
3568                     tp->snd_cwnd = tp->snd_ssthresh;
3569                     tp->rexmt_high = tp->snd_nxt;
3570                     tp->sack_flags &= ~TSACK_F_SACKRESCUED;
3571                     if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
3572                               tp->snd_nxt = old_snd_nxt;
3573                     KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big"));
3574                     if (TCP_DO_SACK(tp)) {
3575                               if (fast_sack_rexmt)
3576                                         tcp_sack_rexmt(tp, FALSE);
3577                     } else {
3578                               tp->snd_cwnd += tp->t_maxseg *
3579                                   (tp->t_dupacks - tp->snd_limited);
3580                     }
3581           } else if ((tcp_do_rfc6675 && TCP_DO_SACK(tp)) || TCP_DO_NCR(tp)) {
3582                     /*
3583                      * The RFC6675 recommends to reduce the byte threshold,
3584                      * and enter fast retransmit if IsLost(snd_una).  However,
3585                      * if we use IsLost(snd_una) based fast retransmit here,
3586                      * segments reordering will cause spurious retransmit.  So
3587                      * we defer the IsLost(snd_una) based fast retransmit until
3588                      * the extended limited transmit can't send any segments and
3589                      * early retransmit can't be done.
3590                      */
3591                     if (tcp_rfc6675_rxt && tcp_do_rfc6675 &&
3592                         tcp_sack_islost(&tp->scb, tp->snd_una))
3593                               goto fastretransmit;
3594 
3595                     if (tcp_do_limitedtransmit || TCP_DO_NCR(tp)) {
3596                               if (!tcp_sack_limitedxmit(tp)) {
3597                                         /* outstanding data */
3598                                         uint32_t ownd = tp->snd_max - tp->snd_una;
3599 
3600                                         if (need_early_retransmit(tp, ownd)) {
3601                                                   ++tcpstat.tcps_sndearlyrexmit;
3602                                                   tp->rxt_flags |= TRXT_F_EARLYREXMT;
3603                                                   goto fastretransmit;
3604                                         } else if (tcp_do_rfc6675 &&
3605                                             tcp_sack_islost(&tp->scb, tp->snd_una)) {
3606                                                   fast_sack_rexmt = FALSE;
3607                                                   goto fastretransmit;
3608                                         }
3609                               }
3610                     }
3611           } else if (tcp_do_limitedtransmit) {
3612                     u_long oldcwnd = tp->snd_cwnd;
3613                     tcp_seq oldsndmax = tp->snd_max;
3614                     tcp_seq oldsndnxt = tp->snd_nxt;
3615                     /* outstanding data */
3616                     uint32_t ownd = tp->snd_max - tp->snd_una;
3617                     u_int sent;
3618 
3619                     KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2,
3620                         ("dupacks not 1 or 2"));
3621                     if (tp->t_dupacks == 1)
3622                               tp->snd_limited = 0;
3623                     tp->snd_nxt = tp->snd_max;
3624                     tp->snd_cwnd = ownd +
3625                         (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg;
3626                     tp->t_flags |= TF_XMITNOW;
3627                     tcp_output(tp);
3628 
3629                     if (SEQ_LT(oldsndnxt, oldsndmax)) {
3630                               KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
3631                                   ("snd_una moved in other threads"));
3632                               tp->snd_nxt = oldsndnxt;
3633                     }
3634                     tp->snd_cwnd = oldcwnd;
3635                     sent = tp->snd_max - oldsndmax;
3636                     if (sent > tp->t_maxseg) {
3637                               KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) ||
3638                                   (sent == tp->t_maxseg + 1 &&
3639                                    (tp->t_flags & TF_SENTFIN)),
3640                                   ("sent too much"));
3641                               KASSERT(sent <= tp->t_maxseg * 2,
3642                                   ("sent too many segments"));
3643                               tp->snd_limited = 2;
3644                               tcpstat.tcps_sndlimited += 2;
3645                     } else if (sent > 0) {
3646                               ++tp->snd_limited;
3647                               ++tcpstat.tcps_sndlimited;
3648                     } else if (need_early_retransmit(tp, ownd)) {
3649                               ++tcpstat.tcps_sndearlyrexmit;
3650                               tp->rxt_flags |= TRXT_F_EARLYREXMT;
3651                               goto fastretransmit;
3652                     }
3653           }
3654           return TRUE;
3655 }
3656