1 /*        $NetBSD: tcp_output.c,v 1.222 2024/09/08 09:36:52 rillig Exp $        */
2 
3 /*
4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the project nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
34  *
35  * NRL grants permission for redistribution and use in source and binary
36  * forms, with or without modification, of the software and documentation
37  * created at NRL provided that the following conditions are met:
38  *
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgements:
46  *      This product includes software developed by the University of
47  *      California, Berkeley and its contributors.
48  *      This product includes software developed at the Information
49  *      Technology Division, US Naval Research Laboratory.
50  * 4. Neither the name of the NRL nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  *
66  * The views and conclusions contained in the software and documentation
67  * are those of the authors and should not be interpreted as representing
68  * official policies, either expressed or implied, of the US Naval
69  * Research Laboratory (NRL).
70  */
71 
72 /*-
73  * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
74  * All rights reserved.
75  *
76  * This code is derived from software contributed to The NetBSD Foundation
77  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78  * Facility, NASA Ames Research Center.
79  * This code is derived from software contributed to The NetBSD Foundation
80  * by Charles M. Hannum.
81  * This code is derived from software contributed to The NetBSD Foundation
82  * by Rui Paulo.
83  *
84  * Redistribution and use in source and binary forms, with or without
85  * modification, are permitted provided that the following conditions
86  * are met:
87  * 1. Redistributions of source code must retain the above copyright
88  *    notice, this list of conditions and the following disclaimer.
89  * 2. Redistributions in binary form must reproduce the above copyright
90  *    notice, this list of conditions and the following disclaimer in the
91  *    documentation and/or other materials provided with the distribution.
92  *
93  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
94  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
96  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
97  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
98  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
99  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
100  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
101  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
103  * POSSIBILITY OF SUCH DAMAGE.
104  */
105 
106 /*
107  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
108  *        The Regents of the University of California.  All rights reserved.
109  *
110  * Redistribution and use in source and binary forms, with or without
111  * modification, are permitted provided that the following conditions
112  * are met:
113  * 1. Redistributions of source code must retain the above copyright
114  *    notice, this list of conditions and the following disclaimer.
115  * 2. Redistributions in binary form must reproduce the above copyright
116  *    notice, this list of conditions and the following disclaimer in the
117  *    documentation and/or other materials provided with the distribution.
118  * 3. Neither the name of the University nor the names of its contributors
119  *    may be used to endorse or promote products derived from this software
120  *    without specific prior written permission.
121  *
122  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132  * SUCH DAMAGE.
133  *
134  *        @(#)tcp_output.c    8.4 (Berkeley) 5/24/95
135  */
136 
137 #include <sys/cdefs.h>
138 __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.222 2024/09/08 09:36:52 rillig Exp $");
139 
140 #ifdef _KERNEL_OPT
141 #include "opt_inet.h"
142 #include "opt_ipsec.h"
143 #include "opt_tcp_debug.h"
144 #endif
145 
146 #include <sys/param.h>
147 #include <sys/systm.h>
148 #include <sys/mbuf.h>
149 #include <sys/protosw.h>
150 #include <sys/socket.h>
151 #include <sys/socketvar.h>
152 #include <sys/errno.h>
153 #include <sys/domain.h>
154 #include <sys/kernel.h>
155 #ifdef TCP_SIGNATURE
156 #include <sys/md5.h>
157 #endif
158 
159 #include <net/if.h>
160 #include <net/route.h>
161 
162 #include <netinet/in.h>
163 #include <netinet/in_systm.h>
164 #include <netinet/ip.h>
165 #include <netinet/in_pcb.h>
166 #include <netinet/ip_var.h>
167 
168 #ifdef INET6
169 #include <netinet/ip6.h>
170 #include <netinet6/in6_var.h>
171 #include <netinet6/ip6_var.h>
172 #include <netinet6/in6_pcb.h>
173 #include <netinet6/nd6.h>
174 #endif
175 
176 #ifdef IPSEC
177 #include <netipsec/ipsec.h>
178 #include <netipsec/key.h>
179 #ifdef INET6
180 #include <netipsec/ipsec6.h>
181 #endif
182 #endif
183 
184 #include <netinet/tcp.h>
185 #define   TCPOUTFLAGS
186 #include <netinet/tcp_fsm.h>
187 #include <netinet/tcp_seq.h>
188 #include <netinet/tcp_timer.h>
189 #include <netinet/tcp_var.h>
190 #include <netinet/tcp_private.h>
191 #include <netinet/tcp_congctl.h>
192 #include <netinet/tcp_debug.h>
193 #include <netinet/in_offload.h>
194 #include <netinet6/in6_offload.h>
195 
/*
 * Congestion Window Monitoring (CWM) knobs.
 *
 * tcp_cwm switches the feature on when non-zero; it is off by default.
 * tcp_cwm_burstsize is the permitted burst, counted in segments (it is
 * multiplied by the transmit segment size in tcp_output()).  The default
 * of 4 packets follows the Internet draft.
 */
int	tcp_cwm = 0;
int	tcp_cwm_burstsize = 4;

/*
 * Send buffer auto-sizing knobs.
 *
 * NOTE(review): none of these is referenced in this part of the file;
 * judging by the names they are an enable flag, a growth increment and
 * an upper bound, both in bytes (8 KiB and 256 KiB) -- confirm against
 * the code that consumes them.
 */
int	tcp_do_autosndbuf = 1;
int	tcp_autosndbuf_inc = 8 * 1024;
int	tcp_autosndbuf_max = 256 * 1024;
207 
/*
 * Optional event counters for the output path.
 *
 * When TCP_OUTPUT_COUNTERS is defined, the counters below are declared
 * here (they are defined elsewhere) and TCP_OUTPUT_COUNTER_INCR(ev) bumps
 * ev->ev_count.  Otherwise the macro is a no-op that discards its
 * argument unexpanded, so callers may name counters that are not
 * declared in this configuration.
 *
 * Both forms expand to a single, fully parenthesized expression so the
 * macro can be used anywhere an expression statement is valid, including
 * as the sole body of an if/else.
 */
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_output_bigheader;
extern struct evcnt tcp_output_predict_hit;
extern struct evcnt tcp_output_predict_miss;
extern struct evcnt tcp_output_copysmall;
extern struct evcnt tcp_output_copybig;
extern struct evcnt tcp_output_refbig;

#define	TCP_OUTPUT_COUNTER_INCR(ev)	((ev)->ev_count++)
#else

#define	TCP_OUTPUT_COUNTER_INCR(ev)	((void)0)

#endif /* TCP_OUTPUT_COUNTERS */
224 
225 static int
tcp_segsize(struct tcpcb * tp,int * txsegsizep,int * rxsegsizep,bool * alwaysfragp)226 tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
227     bool *alwaysfragp)
228 {
229           struct inpcb *inp = tp->t_inpcb;
230           struct socket *so = NULL;
231           struct rtentry *rt;
232           struct ifnet *ifp;
233           int size;
234           int hdrlen;
235           int optlen;
236 
237           *alwaysfragp = false;
238           size = tcp_mssdflt;
239 
240           switch (tp->t_family) {
241           case AF_INET:
242                     hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
243                     break;
244 #ifdef INET6
245           case AF_INET6:
246                     hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
247                     break;
248 #endif
249           default:
250                     hdrlen = 1; /* prevent zero sized segments */
251                     goto out;
252           }
253 
254           rt = inpcb_rtentry(inp);
255           so = inp->inp_socket;
256           if (rt == NULL) {
257                     goto out;
258           }
259 
260           ifp = rt->rt_ifp;
261 
262           if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
263 #ifdef INET6
264                     if (inp->inp_af == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
265                               /*
266                                * RFC2460 section 5, last paragraph: if path MTU is
267                                * smaller than 1280, use 1280 as packet size and
268                                * attach fragment header.
269                                */
270                               size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
271                               *alwaysfragp = true;
272                     } else
273                               size = rt->rt_rmx.rmx_mtu - hdrlen;
274 #else
275                     size = rt->rt_rmx.rmx_mtu - hdrlen;
276 #endif
277           } else if (ifp->if_flags & IFF_LOOPBACK)
278                     size = ifp->if_mtu - hdrlen;
279           else if (inp->inp_af == AF_INET && tp->t_mtudisc)
280                     size = ifp->if_mtu - hdrlen;
281           else if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
282                     size = ifp->if_mtu - hdrlen;
283 #ifdef INET6
284           else if (inp->inp_af == AF_INET6) {
285                     if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
286                               /* mapped addr case */
287                               struct in_addr d;
288                               memcpy(&d, &in6p_faddr(inp).s6_addr32[3], sizeof(d));
289                               if (tp->t_mtudisc || in_localaddr(d))
290                                         size = ifp->if_mtu - hdrlen;
291                     } else {
292                               /*
293                                * for IPv6, path MTU discovery is always turned on,
294                                * or the node must use packet size <= 1280.
295                                */
296                               size = tp->t_mtudisc ? ifp->if_mtu : IPV6_MMTU;
297                               size -= hdrlen;
298                     }
299           }
300 #endif
301           inpcb_rtentry_unref(rt, inp);
302  out:
303           /*
304            * Now we must make room for whatever extra TCP/IP options are in
305            * the packet.
306            */
307           optlen = tcp_optlen(tp);
308 
309           /*
310            * XXX tp->t_ourmss should have the right size, but without this code
311            * fragmentation will occur... need more investigation
312            */
313 
314           if (inp->inp_af == AF_INET) {
315 #if defined(IPSEC)
316                     if (ipsec_used &&
317                         !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
318                               optlen += ipsec4_hdrsiz_tcp(tp);
319 #endif
320                     optlen += ip_optlen(inp);
321           }
322 
323 #ifdef INET6
324           if (inp->inp_af == AF_INET6 && tp->t_family == AF_INET) {
325 #if defined(IPSEC)
326                     if (ipsec_used &&
327                         !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
328                               optlen += ipsec4_hdrsiz_tcp(tp);
329 #endif
330                     /* XXX size -= ip_optlen(in6p); */
331           } else if (inp->inp_af == AF_INET6) {
332 #if defined(IPSEC)
333                     if (ipsec_used &&
334                         !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
335                               optlen += ipsec6_hdrsiz_tcp(tp);
336 #endif
337                     optlen += ip6_optlen(inp);
338           }
339 #endif
340           size -= optlen;
341 
342           /*
343            * There may not be any room for data if mtu is too small. This
344            * includes zero-sized.
345            */
346           if (size <= 0) {
347                     return EMSGSIZE;
348           }
349 
350           /*
351            * *rxsegsizep holds *estimated* inbound segment size (estimation
352            * assumes that path MTU is the same for both ways).  this is only
353            * for silly window avoidance, do not use the value for other purposes.
354            *
355            * ipseclen is subtracted from both sides, this may not be right.
356            * I'm not quite sure about this (could someone comment).
357            */
358           *txsegsizep = uimin(tp->t_peermss - optlen, size);
359           *rxsegsizep = uimin(tp->t_ourmss - optlen, size);
360 
361           /*
362            * Never send more than half a buffer full.  This insures that we can
363            * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
364            * therefore acks will never be delayed unless we run out of data to
365            * transmit.
366            */
367           if (so) {
368                     *txsegsizep = uimin(so->so_snd.sb_hiwat >> 1, *txsegsizep);
369           }
370 
371           /*
372            * A segment must at least store header + options
373            */
374           if (*txsegsizep < hdrlen + optlen) {
375                     return EMSGSIZE;
376           }
377 
378           if (*txsegsizep != tp->t_segsz) {
379                     /*
380                      * If the new segment size is larger, we don't want to
381                      * mess up the congestion window, but if it is smaller
382                      * we'll have to reduce the congestion window to ensure
383                      * that we don't get into trouble with initial windows
384                      * and the rest.  In any case, if the segment size
385                      * has changed, chances are the path has, too, and
386                      * our congestion window will be different.
387                      */
388                     if (*txsegsizep < tp->t_segsz) {
389                               tp->snd_cwnd = uimax((tp->snd_cwnd / tp->t_segsz)
390                                   * *txsegsizep, *txsegsizep);
391                               tp->snd_ssthresh = uimax((tp->snd_ssthresh / tp->t_segsz)
392                                   * *txsegsizep, *txsegsizep);
393                     }
394                     tp->t_segsz = *txsegsizep;
395           }
396 
397           return 0;
398 }
399 
400 static int
tcp_build_datapkt(struct tcpcb * tp,struct socket * so,int off,long len,int hdrlen,struct mbuf ** mp)401 tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
402     long len, int hdrlen, struct mbuf **mp)
403 {
404           struct mbuf *m, *m0;
405           net_stat_ref_t tcps;
406 
407           tcps = TCP_STAT_GETREF();
408           if (tp->t_force && len == 1)
409                     _NET_STATINC_REF(tcps, TCP_STAT_SNDPROBE);
410           else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
411                     tp->t_sndrexmitpack++;
412                     _NET_STATINC_REF(tcps, TCP_STAT_SNDREXMITPACK);
413                     _NET_STATADD_REF(tcps, TCP_STAT_SNDREXMITBYTE, len);
414           } else {
415                     _NET_STATINC_REF(tcps, TCP_STAT_SNDPACK);
416                     _NET_STATADD_REF(tcps, TCP_STAT_SNDBYTE, len);
417           }
418           TCP_STAT_PUTREF();
419 
420           MGETHDR(m, M_DONTWAIT, MT_HEADER);
421           if (__predict_false(m == NULL))
422                     return ENOBUFS;
423           MCLAIM(m, &tcp_tx_mowner);
424 
425           /*
426            * XXX Because other code assumes headers will fit in
427            * XXX one header mbuf.
428            *
429            * (This code should almost *never* be run.)
430            */
431           if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
432                     TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
433                     MCLGET(m, M_DONTWAIT);
434                     if ((m->m_flags & M_EXT) == 0) {
435                               m_freem(m);
436                               return ENOBUFS;
437                     }
438           }
439 
440           m->m_data += max_linkhdr;
441           m->m_len = hdrlen;
442 
443           /*
444            * To avoid traversing the whole sb_mb chain for correct
445            * data to send, remember last sent mbuf, its offset and
446            * the sent size.  When called the next time, see if the
447            * data to send is directly following the previous transfer.
448            * This is important for large TCP windows.
449            */
450           if (off == 0 || tp->t_lastm == NULL ||
451               (tp->t_lastoff + tp->t_lastlen) != off) {
452                     TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
453                     /*
454                      * Either a new packet or a retransmit.
455                      * Start from the beginning.
456                      */
457                     tp->t_lastm = so->so_snd.sb_mb;
458                     tp->t_inoff = off;
459           } else {
460                     TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
461                     tp->t_inoff += tp->t_lastlen;
462           }
463 
464           /* Traverse forward to next packet */
465           while (tp->t_inoff > 0) {
466                     if (tp->t_lastm == NULL)
467                               panic("tp->t_lastm == NULL");
468                     if (tp->t_inoff < tp->t_lastm->m_len)
469                               break;
470                     tp->t_inoff -= tp->t_lastm->m_len;
471                     tp->t_lastm = tp->t_lastm->m_next;
472           }
473 
474           tp->t_lastoff = off;
475           tp->t_lastlen = len;
476           m0 = tp->t_lastm;
477           off = tp->t_inoff;
478 
479           if (len <= M_TRAILINGSPACE(m)) {
480                     m_copydata(m0, off, (int)len, mtod(m, char *) + hdrlen);
481                     m->m_len += len;
482                     TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
483           } else {
484                     m->m_next = m_copym(m0, off, (int)len, M_DONTWAIT);
485                     if (m->m_next == NULL) {
486                               m_freem(m);
487                               return ENOBUFS;
488                     }
489 #ifdef TCP_OUTPUT_COUNTERS
490                     if (m->m_next->m_flags & M_EXT)
491                               TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
492                     else
493                               TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
494 #endif
495           }
496 
497           *mp = m;
498           return 0;
499 }
500 
501 /*
502  * Tcp output routine: figure out what should be sent and send it.
503  */
504 int
tcp_output(struct tcpcb * tp)505 tcp_output(struct tcpcb *tp)
506 {
507           struct rtentry *rt = NULL;
508           struct socket *so;
509           struct route *ro;
510           long len, win;
511           int off, flags, error;
512           struct mbuf *m;
513           struct ip *ip;
514 #ifdef INET6
515           struct ip6_hdr *ip6;
516 #endif
517           struct tcphdr *th;
518           u_char opt[MAX_TCPOPTLEN], *optp;
519 #define OPT_FITS(more)        ((optlen + (more)) <= sizeof(opt))
520           unsigned optlen, hdrlen, packetlen;
521           unsigned int sack_numblks;
522           int idle, sendalot, txsegsize, rxsegsize;
523           int txsegsize_nosack;
524           int maxburst = TCP_MAXBURST;
525           int af;             /* address family on the wire */
526           int iphdrlen;
527           int has_tso4, has_tso6;
528           int has_tso, use_tso;
529           bool alwaysfrag;
530           int sack_rxmit;
531           int sack_bytes_rxmt;
532           int ecn_tos;
533           struct sackhole *p;
534 #ifdef TCP_SIGNATURE
535           int sigoff = 0;
536 #endif
537           net_stat_ref_t tcps;
538 
539           so = tp->t_inpcb->inp_socket;
540           ro = &tp->t_inpcb->inp_route;
541 
542           switch (af = tp->t_family) {
543           case AF_INET:
544           case AF_INET6:
545                     if (tp->t_inpcb)
546                               break;
547                     return EINVAL;
548           default:
549                     return EAFNOSUPPORT;
550           }
551 
552           if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
553                     return EMSGSIZE;
554 
555           idle = (tp->snd_max == tp->snd_una);
556 
557           /*
558            * Determine if we can use TCP segmentation offload:
559            * - If we're using IPv4
560            * - If there is not an IPsec policy that prevents it
561            * - If the interface can do it
562            */
563           has_tso4 = has_tso6 = false;
564 
565           has_tso4 = tp->t_inpcb->inp_af == AF_INET &&
566 #if defined(IPSEC)
567               (!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp,
568               IPSEC_DIR_OUTBOUND)) &&
569 #endif
570               (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
571               (rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0;
572           if (rt != NULL) {
573                     rtcache_unref(rt, &tp->t_inpcb->inp_route);
574                     rt = NULL;
575           }
576 
577 #if defined(INET6)
578           has_tso6 = tp->t_inpcb->inp_af == AF_INET6 &&
579 #if defined(IPSEC)
580               (!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp,
581               IPSEC_DIR_OUTBOUND)) &&
582 #endif
583               (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
584               (rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0;
585           if (rt != NULL)
586                     rtcache_unref(rt, &tp->t_inpcb->inp_route);
587 #endif /* defined(INET6) */
588           has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
589 
590           /*
591            * Restart Window computation.  From draft-floyd-incr-init-win-03:
592            *
593            *        Optionally, a TCP MAY set the restart window to the
594            *        minimum of the value used for the initial window and
595            *        the current value of cwnd (in other words, using a
596            *        larger value for the restart window should never increase
597            *        the size of cwnd).
598            */
599           if (tcp_cwm) {
600                     /*
601                      * Hughes/Touch/Heidemann Congestion Window Monitoring.
602                      * Count the number of packets currently pending
603                      * acknowledgement, and limit our congestion window
604                      * to a pre-determined allowed burst size plus that count.
605                      * This prevents bursting once all pending packets have
606                      * been acknowledged (i.e. transmission is idle).
607                      *
608                      * XXX Link this to Initial Window?
609                      */
610                     tp->snd_cwnd = uimin(tp->snd_cwnd,
611                         (tcp_cwm_burstsize * txsegsize) +
612                         (tp->snd_nxt - tp->snd_una));
613           } else {
614                     if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
615                               /*
616                                * We have been idle for "a while" and no acks are
617                                * expected to clock out any data we send --
618                                * slow start to get ack "clock" running again.
619                                */
620                               int ss = tcp_init_win;
621                               if (tp->t_inpcb->inp_af == AF_INET &&
622                                   in_localaddr(in4p_faddr(tp->t_inpcb)))
623                                         ss = tcp_init_win_local;
624 #ifdef INET6
625                               else if (tp->t_inpcb->inp_af == AF_INET6 &&
626                                   in6_localaddr(&in6p_faddr(tp->t_inpcb)))
627                                         ss = tcp_init_win_local;
628 #endif
629                               tp->snd_cwnd = uimin(tp->snd_cwnd,
630                                   TCP_INITIAL_WINDOW(ss, txsegsize));
631                     }
632           }
633 
634           txsegsize_nosack = txsegsize;
635 again:
636           ecn_tos = 0;
637           use_tso = has_tso;
638           if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
639                     /* don't duplicate CWR/ECE. */
640                     use_tso = 0;
641           }
642           TCP_REASS_LOCK(tp);
643           sack_numblks = tcp_sack_numblks(tp);
644           if (sack_numblks) {
645                     int sackoptlen;
646 
647                     sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
648                     if (sackoptlen > txsegsize_nosack) {
649                               sack_numblks = 0; /* give up SACK */
650                               txsegsize = txsegsize_nosack;
651                     } else {
652                               if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
653                                         /* don't duplicate D-SACK. */
654                                         use_tso = 0;
655                               }
656                               txsegsize = txsegsize_nosack - sackoptlen;
657                     }
658           } else {
659                     txsegsize = txsegsize_nosack;
660           }
661 
662           /*
663            * Determine length of data that should be transmitted, and
664            * flags that should be used.  If there is some data or critical
665            * controls (SYN, RST) to send, then transmit; otherwise,
666            * investigate further.
667            *
668            * Readjust SACK information to avoid resending duplicate data.
669            */
670           if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
671                     tcp_sack_adjust(tp);
672           sendalot = 0;
673           off = tp->snd_nxt - tp->snd_una;
674           win = uimin(tp->snd_wnd, tp->snd_cwnd);
675 
676           flags = tcp_outflags[tp->t_state];
677 
678           /*
679            * Send any SACK-generated retransmissions.  If we're explicitly trying
680            * to send out new data (when sendalot is 1), bypass this function.
681            * If we retransmit in fast recovery mode, decrement snd_cwnd, since
682            * we're replacing a (future) new transmission with a retransmission
683            * now, and we previously incremented snd_cwnd in tcp_input().
684            */
685           /*
686            * Still in sack recovery, reset rxmit flag to zero.
687            */
688           sack_rxmit = 0;
689           sack_bytes_rxmt = 0;
690           len = 0;
691           p = NULL;
692           do {
693                     long cwin;
694                     if (!TCP_SACK_ENABLED(tp))
695                               break;
696                     if (tp->t_partialacks < 0)
697                               break;
698                     p = tcp_sack_output(tp, &sack_bytes_rxmt);
699                     if (p == NULL)
700                               break;
701 
702                     cwin = uimin(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
703                     if (cwin < 0)
704                               cwin = 0;
705                     /* Do not retransmit SACK segments beyond snd_recover */
706                     if (SEQ_GT(p->end, tp->snd_recover)) {
707                               /*
708                                * (At least) part of sack hole extends beyond
709                                * snd_recover. Check to see if we can rexmit data
710                                * for this hole.
711                                */
712                               if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
713                                         /*
714                                          * Can't rexmit any more data for this hole.
715                                          * That data will be rexmitted in the next
716                                          * sack recovery episode, when snd_recover
717                                          * moves past p->rxmit.
718                                          */
719                                         p = NULL;
720                                         break;
721                               }
722                               /* Can rexmit part of the current hole */
723                               len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
724                     } else
725                               len = ((long)ulmin(cwin, p->end - p->rxmit));
726                     off = p->rxmit - tp->snd_una;
727                     if (off + len > so->so_snd.sb_cc) {
728                               /* 1 for TH_FIN */
729                               KASSERT(off + len == so->so_snd.sb_cc + 1);
730                               KASSERT(p->rxmit + len == tp->snd_max);
731                               len = so->so_snd.sb_cc - off;
732                     }
733                     if (len > 0) {
734                               sack_rxmit = 1;
735                               sendalot = 1;
736                     }
737           } while (/*CONSTCOND*/0);
738 
739           /*
740            * If in persist timeout with window of 0, send 1 byte.
741            * Otherwise, if window is small but nonzero
742            * and timer expired, we will send what we can
743            * and go to transmit state.
744            */
745           if (tp->t_force) {
746                     if (win == 0) {
747                               /*
748                                * If we still have some data to send, then
749                                * clear the FIN bit.  Usually this would
750                                * happen below when it realizes that we
751                                * aren't sending all the data.  However,
752                                * if we have exactly 1 byte of unsent data,
753                                * then it won't clear the FIN bit below,
754                                * and if we are in persist state, we wind
755                                * up sending the packet without recording
756                                * that we sent the FIN bit.
757                                *
758                                * We can't just blindly clear the FIN bit,
759                                * because if we don't have any more data
760                                * to send then the probe will be the FIN
761                                * itself.
762                                */
763                               if (off < so->so_snd.sb_cc)
764                                         flags &= ~TH_FIN;
765                               win = 1;
766                     } else {
767                               TCP_TIMER_DISARM(tp, TCPT_PERSIST);
768                               tp->t_rxtshift = 0;
769                     }
770           }
771 
772           if (sack_rxmit == 0) {
773                     if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
774                               long cwin;
775 
776                               /*
777                                * We are inside of a SACK recovery episode and are
778                                * sending new data, having retransmitted all the
779                                * data possible in the scoreboard.
780                                */
781                               if (tp->snd_wnd < so->so_snd.sb_cc) {
782                                         len = tp->snd_wnd - off;
783                                         flags &= ~TH_FIN;
784                               } else {
785                                         len = so->so_snd.sb_cc - off;
786                               }
787 
788                               /*
789                                * From FreeBSD:
790                                *  Don't remove this (len > 0) check !
791                                *  We explicitly check for len > 0 here (although it
792                                *  isn't really necessary), to work around a gcc
793                                *  optimization issue - to force gcc to compute
794                                *  len above. Without this check, the computation
795                                *  of len is bungled by the optimizer.
796                                */
797                               if (len > 0) {
798                                         cwin = tp->snd_cwnd -
799                                             (tp->snd_nxt - tp->sack_newdata) -
800                                             sack_bytes_rxmt;
801                                         if (cwin < 0)
802                                                   cwin = 0;
803                                         if (cwin < len) {
804                                                   len = cwin;
805                                                   flags &= ~TH_FIN;
806                                         }
807                               }
808                     } else if (win < so->so_snd.sb_cc) {
809                               len = win - off;
810                               flags &= ~TH_FIN;
811                     } else {
812                               len = so->so_snd.sb_cc - off;
813                     }
814           }
815 
816           if (len < 0) {
817                     /*
818                      * If FIN has been sent but not acked,
819                      * but we haven't been called to retransmit,
820                      * len will be -1.  Otherwise, window shrank
821                      * after we sent into it.  If window shrank to 0,
822                      * cancel pending retransmit, pull snd_nxt back
823                      * to (closed) window, and set the persist timer
824                      * if it isn't already going.  If the window didn't
825                      * close completely, just wait for an ACK.
826                      *
827                      * If we have a pending FIN, either it has already been
828                      * transmitted or it is outside the window, so drop it.
829                      * If the FIN has been transmitted, but this is not a
830                      * retransmission, then len must be -1.  Therefore we also
831                      * prevent here the sending of `gratuitous FINs'.  This
832                      * eliminates the need to check for that case below (e.g.
833                      * to back up snd_nxt before the FIN so that the sequence
834                      * number is correct).
835                      */
836                     len = 0;
837                     flags &= ~TH_FIN;
838                     if (win == 0) {
839                               TCP_TIMER_DISARM(tp, TCPT_REXMT);
840                               tp->t_rxtshift = 0;
841                               tp->snd_nxt = tp->snd_una;
842                               if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
843                                         tcp_setpersist(tp);
844                     }
845           }
846 
847           /*
848            * Automatic sizing enables the performance of large buffers
849            * and most of the efficiency of small ones by only allocating
850            * space when it is needed.
851            *
852            * The criteria to step up the send buffer one notch are:
853            *  1. receive window of remote host is larger than send buffer
854            *     (with a fudge factor of 5/4th);
855            *  2. send buffer is filled to 7/8th with data (so we actually
856            *     have data to make use of it);
857            *  3. send buffer fill has not hit maximal automatic size;
858            *  4. our send window (slow start and congestion controlled) is
859            *     larger than sent but unacknowledged data in send buffer.
860            *
861            * The remote host receive window scaling factor may limit the
862            * growing of the send buffer before it reaches its allowed
863            * maximum.
864            *
865            * It scales directly with slow start or congestion window
866            * and does at most one step per received ACK.  This fast
867            * scaling has the drawback of growing the send buffer beyond
868            * what is strictly necessary to make full use of a given
869            * delay*bandwidth product.  However testing has shown this not
870            * to be much of a problem.  At worst we are trading wasting
871            * of available bandwidth (the non-use of it) for wasting some
872            * socket buffer memory.
873            *
874            * TODO: Shrink send buffer during idle periods together
875            * with congestion window.  Requires another timer.
876            */
877           if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
878                     if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
879                         so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
880                         so->so_snd.sb_cc < tcp_autosndbuf_max &&
881                         win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
882                               if (!sbreserve(&so->so_snd,
883                                   uimin(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
884                                    tcp_autosndbuf_max), so))
885                                         so->so_snd.sb_flags &= ~SB_AUTOSIZE;
886                     }
887           }
888 
889           if (len > txsegsize) {
890                     if (use_tso) {
891                               /*
892                                * Truncate TSO transfers to IP_MAXPACKET, and make
893                                * sure that we send equal size transfers down the
894                                * stack (rather than big-small-big-small-...).
895                                */
896 #ifdef INET6
897                               CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET);
898 #endif
899                               len = (uimin(len, IP_MAXPACKET) / txsegsize) * txsegsize;
900                               if (len <= txsegsize) {
901                                         use_tso = 0;
902                               }
903                     } else
904                               len = txsegsize;
905                     flags &= ~TH_FIN;
906                     sendalot = 1;
907           } else
908                     use_tso = 0;
909           if (sack_rxmit) {
910                     if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
911                               flags &= ~TH_FIN;
912           }
913 
914           win = sbspace(&so->so_rcv);
915 
916           /*
917            * Sender silly window avoidance.  If the connection is idle
918            * and we can send all queued data, or we can send at least
919            * a maximum-size segment, or we are forced to send,
920            * do it; otherwise don't bother.
921            * If peer's buffer is tiny, then send
922            * when window is at least half open.
923            * If retransmitting (possibly after persist timer forced us
924            * to send into a small window), then must resend.
925            */
926           if (len) {
927                     if (len >= txsegsize)
928                               goto send;
929                     if ((so->so_state & SS_MORETOCOME) == 0 &&
930                         ((idle || tp->t_flags & TF_NODELAY) &&
931                          len + off >= so->so_snd.sb_cc))
932                               goto send;
933                     if (tp->t_force)
934                               goto send;
935                     if (len >= tp->max_sndwnd / 2)
936                               goto send;
937                     if (SEQ_LT(tp->snd_nxt, tp->snd_max))
938                               goto send;
939                     if (sack_rxmit)
940                               goto send;
941           }
942 
943           /*
944            * Compare available window to amount of window known to peer
945            * (as advertised window less next expected input).  If the
946            * difference is at least twice the size of the largest segment
947            * we expect to receive (i.e. two segments) or at least 50% of
948            * the maximum possible window, then want to send a window update
949            * to peer.
950            */
951           if (win > 0) {
952                     /*
953                      * "adv" is the amount we can increase the window,
954                      * taking into account that we are limited by
955                      * TCP_MAXWIN << tp->rcv_scale.
956                      */
957                     long recwin = uimin(win, (long)TCP_MAXWIN << tp->rcv_scale);
958                     long oldwin, adv;
959 
960                     /*
961                      * rcv_nxt may overtake rcv_adv when we accept a
962                      * zero-window probe.
963                      */
964                     if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
965                               oldwin = tp->rcv_adv - tp->rcv_nxt;
966                     else
967                               oldwin = 0;
968 
969                     /*
970                      * If the new window size ends up being the same as or
971                      * less than the old size when it is scaled, then
972                      * don't force a window update.
973                      */
974                     if (recwin >> tp->rcv_scale <= oldwin >> tp->rcv_scale)
975                               goto dontupdate;
976 
977                     adv = recwin - oldwin;
978                     if (adv >= (long) (2 * rxsegsize))
979                               goto send;
980                     if (2 * adv >= (long) so->so_rcv.sb_hiwat)
981                               goto send;
982           }
983 dontupdate:
984 
985           /*
986            * Send if we owe peer an ACK.
987            */
988           if (tp->t_flags & TF_ACKNOW)
989                     goto send;
990           if (flags & (TH_SYN|TH_FIN|TH_RST))
991                     goto send;
992           if (SEQ_GT(tp->snd_up, tp->snd_una))
993                     goto send;
994           /*
995            * In SACK, it is possible for tcp_output to fail to send a segment
996            * after the retransmission timer has been turned off.  Make sure
997            * that the retransmission timer is set.
998            */
999           if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
1000               !TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
1001               !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1002                     TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1003                     goto just_return;
1004           }
1005 
1006           /*
1007            * TCP window updates are not reliable, rather a polling protocol
1008            * using ``persist'' packets is used to ensure receipt of window
1009            * updates.  The three ``states'' for the output side are:
1010            *        idle                          not doing retransmits or persists
1011            *        persisting                    to move a small or zero window
1012            *        (re)transmitting    and thereby not persisting
1013            *
1014            * tp->t_timer[TCPT_PERSIST]
1015            *        is set when we are in persist state.
1016            * tp->t_force
1017            *        is set when we are called to send a persist packet.
1018            * tp->t_timer[TCPT_REXMT]
1019            *        is set when we are retransmitting
1020            * The output side is idle when both timers are zero.
1021            *
1022            * If send window is too small, there is data to transmit, and no
1023            * retransmit or persist is pending, then go to persist state.
1024            * If nothing happens soon, send when timer expires:
1025            * if window is nonzero, transmit what we can,
1026            * otherwise force out a byte.
1027            */
1028           if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
1029               TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
1030                     tp->t_rxtshift = 0;
1031                     tcp_setpersist(tp);
1032           }
1033 
1034           /*
1035            * No reason to send a segment, just return.
1036            */
1037 just_return:
1038           TCP_REASS_UNLOCK(tp);
1039           return 0;
1040 
1041 send:
1042           /*
1043            * Before ESTABLISHED, force sending of initial options unless TCP set
1044            * not to do any options.
1045            *
1046            * Note: we assume that the IP/TCP header plus TCP options always fit
1047            * in a single mbuf, leaving room for a maximum link header, i.e.:
1048            *     max_linkhdr + IP_header + TCP_header + optlen <= MCLBYTES
1049            */
1050           optlen = 0;
1051           optp = opt;
1052           switch (af) {
1053           case AF_INET:
1054                     iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
1055                     break;
1056 #ifdef INET6
1057           case AF_INET6:
1058                     iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1059                     break;
1060 #endif
1061           default:  /*pacify gcc*/
1062                     iphdrlen = 0;
1063                     break;
1064           }
1065           hdrlen = iphdrlen;
1066           if (flags & TH_SYN) {
1067                     struct rtentry *synrt;
1068 
1069                     synrt = inpcb_rtentry(tp->t_inpcb);
1070                     tp->snd_nxt = tp->iss;
1071                     tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ?
1072                                                                 synrt->rt_ifp : NULL, af);
1073                     inpcb_rtentry_unref(synrt, tp->t_inpcb);
1074                     if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) {
1075                               *optp++ = TCPOPT_MAXSEG;
1076                               *optp++ = TCPOLEN_MAXSEG;
1077                               *optp++ = (tp->t_ourmss >> 8) & 0xff;
1078                               *optp++ = tp->t_ourmss & 0xff;
1079                               optlen += TCPOLEN_MAXSEG;
1080 
1081                               if ((tp->t_flags & TF_REQ_SCALE) &&
1082                                   ((flags & TH_ACK) == 0 ||
1083                                   (tp->t_flags & TF_RCVD_SCALE)) &&
1084                                   OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) {
1085                                         *((uint32_t *)optp) = htonl(
1086                                                   TCPOPT_NOP << 24 |
1087                                                   TCPOPT_WINDOW << 16 |
1088                                                   TCPOLEN_WINDOW << 8 |
1089                                                   tp->request_r_scale);
1090                                         optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
1091                                         optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
1092                               }
1093                               if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) {
1094                                         *optp++ = TCPOPT_SACK_PERMITTED;
1095                                         *optp++ = TCPOLEN_SACK_PERMITTED;
1096                                         optlen += TCPOLEN_SACK_PERMITTED;
1097                               }
1098                     }
1099           }
1100 
1101           /*
1102            * Send a timestamp and echo-reply if this is a SYN and our side
1103            * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
1104            * and our peer have sent timestamps in our SYN's.
1105            */
1106           if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1107                (flags & TH_RST) == 0 &&
1108               ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
1109                (tp->t_flags & TF_RCVD_TSTMP))) {
1110                     int alen = 0;
1111                     while (optlen % 4 != 2) {
1112                               optlen += TCPOLEN_NOP;
1113                               *optp++ = TCPOPT_NOP;
1114                               alen++;
1115                     }
1116                     if (OPT_FITS(TCPOLEN_TIMESTAMP)) {
1117                               *optp++ = TCPOPT_TIMESTAMP;
1118                               *optp++ = TCPOLEN_TIMESTAMP;
1119                               uint32_t *lp = (uint32_t *)optp;
1120                               /* Form timestamp option (appendix A of RFC 1323) */
1121                               *lp++ = htonl(TCP_TIMESTAMP(tp));
1122                               *lp   = htonl(tp->ts_recent);
1123                               optp += TCPOLEN_TIMESTAMP - 2;
1124                               optlen += TCPOLEN_TIMESTAMP;
1125 
1126                               /* Set receive buffer autosizing timestamp. */
1127                               if (tp->rfbuf_ts == 0 &&
1128                                   (so->so_rcv.sb_flags & SB_AUTOSIZE))
1129                                         tp->rfbuf_ts = TCP_TIMESTAMP(tp);
1130                     } else {
1131                               optp -= alen;
1132                               optlen -= alen;
1133                     }
1134           }
1135 
1136 #ifdef TCP_SIGNATURE
1137           if (tp->t_flags & TF_SIGNATURE) {
1138                     /*
1139                      * Initialize TCP-MD5 option (RFC2385)
1140                      */
1141                     if (!OPT_FITS(TCPOLEN_SIGNATURE))
1142                               goto reset;
1143 
1144                     *optp++ = TCPOPT_SIGNATURE;
1145                     *optp++ = TCPOLEN_SIGNATURE;
1146                     sigoff = optlen + 2;
1147                     memset(optp, 0, TCP_SIGLEN);
1148                     optlen += TCPOLEN_SIGNATURE;
1149                     optp += TCP_SIGLEN;
1150           }
1151 #endif
1152 
1153           /*
1154            * Tack on the SACK block if it is necessary.
1155            */
1156           if (sack_numblks) {
1157                     int alen = 0;
1158                     int sack_len = sack_numblks * 8;
1159                     while (optlen % 4 != 2) {
1160                               optlen += TCPOLEN_NOP;
1161                               *optp++ = TCPOPT_NOP;
1162                               alen++;
1163                     }
1164                     if (OPT_FITS(sack_len + 2)) {
1165                               struct ipqent *tiqe;
1166                               *optp++ = TCPOPT_SACK;
1167                               *optp++ = sack_len + 2;
1168                               uint32_t *lp = (uint32_t *)optp;
1169                               if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
1170                                         sack_numblks--;
1171                                         *lp++ = htonl(tp->rcv_dsack_block.left);
1172                                         *lp++ = htonl(tp->rcv_dsack_block.right);
1173                                         tp->rcv_sack_flags &= ~TCPSACK_HAVED;
1174                               }
1175                               for (tiqe = TAILQ_FIRST(&tp->timeq);
1176                                   sack_numblks > 0;
1177                                   tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
1178                                         KASSERT(tiqe != NULL);
1179                                         sack_numblks--;
1180                                         *lp++ = htonl(tiqe->ipqe_seq);
1181                                         *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
1182                                             ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
1183                               }
1184                               optlen += sack_len + 2;
1185                               optp += sack_len;
1186                     } else {
1187                               optp -= alen;
1188                               optlen -= alen;
1189                     }
1190           }
1191 
1192           /* Terminate and pad TCP options to a 4 byte boundary. */
1193           if (optlen % 4) {
1194                     if (!OPT_FITS(TCPOLEN_EOL)) {
1195 reset:                        TCP_REASS_UNLOCK(tp);
1196                               error = ECONNABORTED;
1197                               goto out;
1198                     }
1199                     optlen += TCPOLEN_EOL;
1200                     *optp++ = TCPOPT_EOL;
1201           }
1202           /*
1203            * According to RFC 793 (STD0007):
1204            *   "The content of the header beyond the End-of-Option option
1205            *    must be header padding (i.e., zero)."
1206            *   and later: "The padding is composed of zeros."
1207            */
1208           while (optlen % 4) {
1209                     if (!OPT_FITS(TCPOLEN_PAD))
1210                               goto reset;
1211                     optlen += TCPOLEN_PAD;
1212                     *optp++ = TCPOPT_PAD;
1213           }
1214 
1215           TCP_REASS_UNLOCK(tp);
1216 
1217           hdrlen += optlen;
1218 
1219 #ifdef DIAGNOSTIC
1220           if (!use_tso && len > txsegsize)
1221                     panic("tcp data to be sent is larger than segment");
1222           else if (use_tso && len > IP_MAXPACKET)
1223                     panic("tcp data to be sent is larger than max TSO size");
1224           if (max_linkhdr + hdrlen > MCLBYTES)
1225                     panic("tcphdr too big");
1226 #endif
1227 
1228           /*
1229            * Grab a header mbuf, attaching a copy of data to
1230            * be transmitted, and initialize the header from
1231            * the template for sends on this connection.
1232            */
1233           if (len) {
1234                     error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
1235                     if (error)
1236                               goto out;
1237                     /*
1238                      * If we're sending everything we've got, set PUSH.
1239                      * (This will keep happy those implementations which only
1240                      * give data to the user when a buffer fills or
1241                      * a PUSH comes in.)
1242                      */
1243                     if (off + len == so->so_snd.sb_cc)
1244                               flags |= TH_PUSH;
1245           } else {
1246                     tcps = TCP_STAT_GETREF();
1247                     if (tp->t_flags & TF_ACKNOW)
1248                               _NET_STATINC_REF(tcps, TCP_STAT_SNDACKS);
1249                     else if (flags & (TH_SYN|TH_FIN|TH_RST))
1250                               _NET_STATINC_REF(tcps, TCP_STAT_SNDCTRL);
1251                     else if (SEQ_GT(tp->snd_up, tp->snd_una))
1252                               _NET_STATINC_REF(tcps, TCP_STAT_SNDURG);
1253                     else
1254                               _NET_STATINC_REF(tcps, TCP_STAT_SNDWINUP);
1255                     TCP_STAT_PUTREF();
1256 
1257                     MGETHDR(m, M_DONTWAIT, MT_HEADER);
1258                     if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
1259                               MCLGET(m, M_DONTWAIT);
1260                               if ((m->m_flags & M_EXT) == 0) {
1261                                         m_freem(m);
1262                                         m = NULL;
1263                               }
1264                     }
1265                     if (m == NULL) {
1266                               error = ENOBUFS;
1267                               goto out;
1268                     }
1269                     MCLAIM(m, &tcp_tx_mowner);
1270                     m->m_data += max_linkhdr;
1271                     m->m_len = hdrlen;
1272           }
1273           m_reset_rcvif(m);
1274           switch (af) {
1275           case AF_INET:
1276                     ip = mtod(m, struct ip *);
1277 #ifdef INET6
1278                     ip6 = NULL;
1279 #endif
1280                     th = (struct tcphdr *)(ip + 1);
1281                     break;
1282 #ifdef INET6
1283           case AF_INET6:
1284                     ip = NULL;
1285                     ip6 = mtod(m, struct ip6_hdr *);
1286                     th = (struct tcphdr *)(ip6 + 1);
1287                     break;
1288 #endif
1289           default:  /*pacify gcc*/
1290                     ip = NULL;
1291 #ifdef INET6
1292                     ip6 = NULL;
1293 #endif
1294                     th = NULL;
1295                     break;
1296           }
1297           if (tp->t_template == NULL)
1298                     panic("%s: no template", __func__);
1299           if (tp->t_template->m_len < iphdrlen)
1300                     panic("%s: %d < %d", __func__, tp->t_template->m_len, iphdrlen);
1301           bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen);
1302 
1303           /*
1304            * If we are starting a connection, send ECN setup
1305            * SYN packet. If we are on a retransmit, we may
1306            * resend those bits a number of times as per
1307            * RFC 3168.
1308            */
1309           if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
1310                     if (tp->t_flags & TF_SYN_REXMT) {
1311                               if (tp->t_ecn_retries--)
1312                                         flags |= TH_ECE|TH_CWR;
1313                     } else {
1314                               flags |= TH_ECE|TH_CWR;
1315                               tp->t_ecn_retries = tcp_ecn_maxretries;
1316                     }
1317           }
1318 
1319           if (TCP_ECN_ALLOWED(tp)) {
1320                     /*
1321                      * If the peer has ECN, mark data packets
1322                      * ECN capable. Ignore pure ack packets, retransmissions
1323                      * and window probes.
1324                      */
1325                     if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
1326                         !(tp->t_force && len == 1)) {
1327                               ecn_tos = IPTOS_ECN_ECT0;
1328                               TCP_STATINC(TCP_STAT_ECN_ECT);
1329                     }
1330 
1331                     /*
1332                      * Reply with proper ECN notifications.
1333                      */
1334                     if (tp->t_flags & TF_ECN_SND_CWR) {
1335                               flags |= TH_CWR;
1336                               tp->t_flags &= ~TF_ECN_SND_CWR;
1337                     }
1338                     if (tp->t_flags & TF_ECN_SND_ECE) {
1339                               flags |= TH_ECE;
1340                     }
1341           }
1342 
1343           /*
1344            * If we are doing retransmissions, then snd_nxt will
1345            * not reflect the first unsent octet.  For ACK only
1346            * packets, we do not want the sequence number of the
1347            * retransmitted packet, we want the sequence number
1348            * of the next unsent octet.  So, if there is no data
1349            * (and no SYN or FIN), use snd_max instead of snd_nxt
1350            * when filling in th_seq.  But if we are in persist
1351            * state, snd_max might reflect one byte beyond the
1352            * right edge of the window, so use snd_nxt in that
1353            * case, since we know we aren't doing a retransmission.
1354            * (retransmit and persist are mutually exclusive...)
1355            */
1356           if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
1357                     th->th_seq = htonl(p->rxmit);
1358                     p->rxmit += len;
1359           } else {
1360                     if (len || (flags & (TH_SYN|TH_FIN)) ||
1361                         TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
1362                               th->th_seq = htonl(tp->snd_nxt);
1363                     else
1364                               th->th_seq = htonl(tp->snd_max);
1365           }
1366           th->th_ack = htonl(tp->rcv_nxt);
1367           if (optlen) {
1368                     memcpy(th + 1, opt, optlen);
1369                     th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1370           }
1371           th->th_flags = flags;
1372           /*
1373            * Calculate receive window.  Don't shrink window,
1374            * but avoid silly window syndrome.
1375            */
1376           if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
1377                     win = 0;
1378           if (win > (long)TCP_MAXWIN << tp->rcv_scale)
1379                     win = (long)TCP_MAXWIN << tp->rcv_scale;
1380           if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
1381                     win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
1382           th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
1383           if (th->th_win == 0) {
1384                     tp->t_sndzerowin++;
1385           }
1386           if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1387                     u_int32_t urp = tp->snd_up - tp->snd_nxt;
1388                     if (urp > IP_MAXPACKET)
1389                               urp = IP_MAXPACKET;
1390                     th->th_urp = htons((u_int16_t)urp);
1391                     th->th_flags |= TH_URG;
1392           } else
1393                     /*
1394                      * If no urgent pointer to send, then we pull
1395                      * the urgent pointer to the left edge of the send window
1396                      * so that it doesn't drift into the send window on sequence
1397                      * number wraparound.
1398                      */
1399                     tp->snd_up = tp->snd_una;               /* drag it along */
1400 
1401 #ifdef TCP_SIGNATURE
1402           if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
1403                     struct secasvar *sav;
1404                     u_int8_t *sigp;
1405 
1406                     sav = tcp_signature_getsav(m);
1407                     if (sav == NULL) {
1408                               m_freem(m);
1409                               return EPERM;
1410                     }
1411 
1412                     m->m_pkthdr.len = hdrlen + len;
1413                     sigp = (char *)th + sizeof(*th) + sigoff;
1414                     tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp);
1415 
1416                     key_sa_recordxfer(sav, m);
1417                     KEY_SA_UNREF(&sav);
1418           }
1419 #endif
1420 
1421           /*
1422            * Set ourselves up to be checksummed just before the packet
1423            * hits the wire.
1424            */
1425           switch (af) {
1426           case AF_INET:
1427                     m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1428                     if (use_tso) {
1429                               m->m_pkthdr.segsz = txsegsize;
1430                               m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
1431                     } else {
1432                               m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
1433                               if (len + optlen) {
1434                                         /* Fixup the pseudo-header checksum. */
1435                                         /* XXXJRT Not IP Jumbogram safe. */
1436                                         th->th_sum = in_cksum_addword(th->th_sum,
1437                                             htons((u_int16_t) (len + optlen)));
1438                               }
1439                     }
1440                     break;
1441 #ifdef INET6
1442           case AF_INET6:
1443                     m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1444                     if (use_tso) {
1445                               m->m_pkthdr.segsz = txsegsize;
1446                               m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
1447                     } else {
1448                               m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
1449                               if (len + optlen) {
1450                                         /* Fixup the pseudo-header checksum. */
1451                                         /* XXXJRT: Not IPv6 Jumbogram safe. */
1452                                         th->th_sum = in_cksum_addword(th->th_sum,
1453                                             htons((u_int16_t) (len + optlen)));
1454                               }
1455                     }
1456                     break;
1457 #endif
1458           }
1459 
1460           /*
1461            * In transmit state, time the transmission and arrange for
1462            * the retransmit.  In persist state, just set snd_max.
1463            */
1464           if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
1465                     tcp_seq startseq = tp->snd_nxt;
1466 
1467                     /*
1468                      * Advance snd_nxt over sequence space of this segment.
1469                      * There are no states in which we send both a SYN and a FIN,
1470                      * so we collapse the tests for these flags.
1471                      */
1472                     if (flags & (TH_SYN|TH_FIN))
1473                               tp->snd_nxt++;
1474                     if (sack_rxmit)
1475                               goto timer;
1476                     tp->snd_nxt += len;
1477                     if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1478                               tp->snd_max = tp->snd_nxt;
1479                               /*
1480                                * Time this transmission if not a retransmission and
1481                                * not currently timing anything.
1482                                */
1483                               if (tp->t_rtttime == 0) {
1484                                         tp->t_rtttime = tcp_now;
1485                                         tp->t_rtseq = startseq;
1486                                         TCP_STATINC(TCP_STAT_SEGSTIMED);
1487                               }
1488                     }
1489 
1490                     /*
1491                      * Set retransmit timer if not currently set,
1492                      * and not doing an ack or a keep-alive probe.
1493                      * Initial value for retransmit timer is smoothed
1494                      * round-trip time + 2 * round-trip time variance.
1495                      * Initialize shift counter which is used for backoff
1496                      * of retransmit time.
1497                      */
1498 timer:
1499                     if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) {
1500                               if ((sack_rxmit && tp->snd_nxt != tp->snd_max)
1501                                   || tp->snd_nxt != tp->snd_una) {
1502                                         if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1503                                                   TCP_TIMER_DISARM(tp, TCPT_PERSIST);
1504                                                   tp->t_rxtshift = 0;
1505                                         }
1506                                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1507                               } else if (len == 0 && so->so_snd.sb_cc > 0
1508                                   && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
1509                                         /*
1510                                          * If we are sending a window probe and there's
1511                                          * unacked data in the socket, make sure at
1512                                          * least the persist timer is running.
1513                                          */
1514                                         tp->t_rxtshift = 0;
1515                                         tcp_setpersist(tp);
1516                               }
1517                     }
1518           } else
1519                     if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
1520                               tp->snd_max = tp->snd_nxt + len;
1521 
1522 #ifdef TCP_DEBUG
1523           /*
1524            * Trace.
1525            */
1526           if (so->so_options & SO_DEBUG)
1527                     tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
1528 #endif
1529 
1530           /*
1531            * Fill in IP length and desired time to live and
1532            * send to IP level.  There should be a better way
1533            * to handle ttl and tos; we could keep them in
1534            * the template, but need a way to checksum without them.
1535            */
1536           m->m_pkthdr.len = hdrlen + len;
1537 
1538           switch (af) {
1539           case AF_INET:
1540                     ip->ip_len = htons(m->m_pkthdr.len);
1541                     packetlen = m->m_pkthdr.len;
1542                     if (tp->t_inpcb->inp_af == AF_INET) {
1543                               ip->ip_ttl = in4p_ip(tp->t_inpcb).ip_ttl;
1544                               ip->ip_tos = in4p_ip(tp->t_inpcb).ip_tos | ecn_tos;
1545                     }
1546 #ifdef INET6
1547                     else if (tp->t_inpcb->inp_af == AF_INET6) {
1548                               ip->ip_ttl = in6pcb_selecthlim(tp->t_inpcb, NULL); /*XXX*/
1549                               ip->ip_tos = ecn_tos;         /*XXX*/
1550                     }
1551 #endif
1552                     break;
1553 #ifdef INET6
1554           case AF_INET6:
1555                     packetlen = m->m_pkthdr.len;
1556                     ip6->ip6_nxt = IPPROTO_TCP;
1557                     if (tp->t_family == AF_INET6) {
1558                               /*
1559                                * we separately set hoplimit for every segment, since
1560                                * the user might want to change the value via
1561                                * setsockopt. Also, desired default hop limit might
1562                                * be changed via Neighbor Discovery.
1563                                */
1564                               ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
1565                     }
1566                     ip6->ip6_flow |= htonl(ecn_tos << 20);
1567                     /* ip6->ip6_flow = ??? (from template) */
1568                     /* ip6_plen will be filled in ip6_output(). */
1569                     break;
1570 #endif
1571           default:  /*pacify gcc*/
1572                     packetlen = 0;
1573                     break;
1574           }
1575 
1576           switch (af) {
1577           case AF_INET:
1578               {
1579                     struct mbuf *opts;
1580 
1581                     if (tp->t_inpcb->inp_af == AF_INET)
1582                               opts = tp->t_inpcb->inp_options;
1583                     else
1584                               opts = NULL;
1585                     error = ip_output(m, opts, ro,
1586                               (tp->t_mtudisc ? IP_MTUDISC : 0) |
1587                               (so->so_options & SO_DONTROUTE), NULL, tp->t_inpcb);
1588                     break;
1589               }
1590 #ifdef INET6
1591           case AF_INET6:
1592               {
1593                     struct ip6_pktopts *opts;
1594 
1595                     if (tp->t_inpcb->inp_af == AF_INET6)
1596                               opts = in6p_outputopts(tp->t_inpcb);
1597                     else
1598                               opts = NULL;
1599                     error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
1600                               NULL, tp->t_inpcb, NULL);
1601                     break;
1602               }
1603 #endif
1604           default:
1605                     error = EAFNOSUPPORT;
1606                     break;
1607           }
1608           if (error) {
1609 out:
1610                     if (error == ENOBUFS) {
1611                               TCP_STATINC(TCP_STAT_SELFQUENCH);
1612                               tcp_quench(tp->t_inpcb);
1613                               error = 0;
1614                     } else if ((error == EHOSTUNREACH || error == ENETDOWN ||
1615                         error == EHOSTDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) {
1616                               tp->t_softerror = error;
1617                               error = 0;
1618                     }
1619 
1620                     /* Back out the sequence number advance. */
1621                     if (sack_rxmit)
1622                               p->rxmit -= len;
1623 
1624                     /* Restart the delayed ACK timer, if necessary. */
1625                     if (tp->t_flags & TF_DELACK)
1626                               TCP_RESTART_DELACK(tp);
1627 
1628                     return error;
1629           }
1630 
1631           if (packetlen > tp->t_pmtud_mtu_sent)
1632                     tp->t_pmtud_mtu_sent = packetlen;
1633 
1634           tcps = TCP_STAT_GETREF();
1635           _NET_STATINC_REF(tcps, TCP_STAT_SNDTOTAL);
1636           if (tp->t_flags & TF_DELACK)
1637                     _NET_STATINC_REF(tcps, TCP_STAT_DELACK);
1638           TCP_STAT_PUTREF();
1639 
1640           /*
1641            * Data sent (as far as we can tell).
1642            * If this advertises a larger window than any other segment,
1643            * then remember the size of the advertised window.
1644            * Any pending ACK has now been sent.
1645            */
1646           if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1647                     tp->rcv_adv = tp->rcv_nxt + win;
1648           tp->last_ack_sent = tp->rcv_nxt;
1649           tp->t_flags &= ~TF_ACKNOW;
1650           TCP_CLEAR_DELACK(tp);
1651 #ifdef DIAGNOSTIC
1652           if (maxburst < 0)
1653                     printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
1654 #endif
1655           if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
1656                     goto again;
1657           return 0;
1658 }
1659 
1660 void
tcp_setpersist(struct tcpcb * tp)1661 tcp_setpersist(struct tcpcb *tp)
1662 {
1663           int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
1664           int nticks;
1665 
1666           if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
1667                     panic("tcp_output REXMT");
1668           /*
1669            * Start/restart persistance timer.
1670            */
1671           if (t < tp->t_rttmin)
1672                     t = tp->t_rttmin;
1673           TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
1674               TCPTV_PERSMIN, TCPTV_PERSMAX);
1675           TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
1676           if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1677                     tp->t_rxtshift++;
1678 }
1679