1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1993
5 * The Regents of the University of California.
6 * Copyright (c) 2006-2007 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * All rights reserved.
9 *
10 * Portions of this software were developed by Robert N. M. Watson under
11 * contract to Juniper Networks, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
38 */
39
40 #include <sys/cdefs.h>
41 #include "opt_ddb.h"
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_ipsec.h"
45 #include "opt_kern_tls.h"
46 #include "opt_tcpdebug.h"
47
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/arb.h>
51 #include <sys/limits.h>
52 #include <sys/malloc.h>
53 #include <sys/refcount.h>
54 #include <sys/kernel.h>
55 #include <sys/ktls.h>
56 #include <sys/qmath.h>
57 #include <sys/sysctl.h>
58 #include <sys/mbuf.h>
59 #ifdef INET6
60 #include <sys/domain.h>
61 #endif /* INET6 */
62 #include <sys/socket.h>
63 #include <sys/socketvar.h>
64 #include <sys/protosw.h>
65 #include <sys/proc.h>
66 #include <sys/jail.h>
67 #include <sys/syslog.h>
68 #include <sys/stats.h>
69
70 #ifdef DDB
71 #include <ddb/ddb.h>
72 #endif
73
74 #include <net/if.h>
75 #include <net/if_var.h>
76 #include <net/route.h>
77 #include <net/vnet.h>
78
79 #include <netinet/in.h>
80 #include <netinet/in_kdtrace.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip_var.h>
85 #ifdef INET6
86 #include <netinet/ip6.h>
87 #include <netinet6/in6_pcb.h>
88 #include <netinet6/ip6_var.h>
89 #include <netinet6/scope6_var.h>
90 #endif
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_fsm.h>
93 #include <netinet/tcp_seq.h>
94 #include <netinet/tcp_timer.h>
95 #include <netinet/tcp_var.h>
96 #include <netinet/tcp_log_buf.h>
97 #include <netinet/tcpip.h>
98 #include <netinet/cc/cc.h>
99 #include <netinet/tcp_fastopen.h>
100 #include <netinet/tcp_hpts.h>
101 #ifdef TCPPCAP
102 #include <netinet/tcp_pcap.h>
103 #endif
104 #ifdef TCPDEBUG
105 #include <netinet/tcp_debug.h>
106 #endif
107 #ifdef TCP_OFFLOAD
108 #include <netinet/tcp_offload.h>
109 #endif
110 #include <netipsec/ipsec_support.h>
111
112 #include <vm/vm.h>
113 #include <vm/vm_param.h>
114 #include <vm/pmap.h>
115 #include <vm/vm_extern.h>
116 #include <vm/vm_map.h>
117 #include <vm/vm_page.h>
118
119 /*
120 * TCP protocol interface to socket abstraction.
121 */
/*
 * Forward declarations for helpers that are used by the pru_*() handlers
 * before their definitions appear later in this file.
 */
#ifdef INET
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET */
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET6 */
static void	tcp_disconnect(struct tcpcb *tp);
static void	tcp_usrclosed(struct tcpcb *tp);
static void	tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti);

static int	tcp_pru_options_support(struct tcpcb *tp, int flags);
135
/*
 * User-request tracing helpers.
 *
 * With TCPDEBUG compiled in:
 *   TCPDEBUG0      declares the local 'ostate' (goes with the declarations);
 *   TCPDEBUG1()    records tp->t_state in 'ostate' (0 when tp is NULL);
 *   TCPDEBUG2(req) calls tcp_trace() for request 'req' when tp is non-NULL
 *                  and SO_DEBUG is set on the socket.
 * The macros rely on locals named 'tp' and 'so' being in scope at the
 * point of use.  Without TCPDEBUG they all expand to nothing.
 *
 * TCPDEBUG2() is wrapped in do { } while (0) so that it behaves as a single
 * statement; a bare 'if' expansion could capture a following 'else'.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, (req));		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
146
/*
 * tcp_require_unique_port requires a globally-unique source port for each
 * outgoing connection.  The default (0) is to require the 4-tuple to be
 * unique.  The knob is per-VNET and exported read/write as the
 * "require_unique_port" sysctl under _net_inet_tcp.
 */
VNET_DEFINE(int, tcp_require_unique_port) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, require_unique_port,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_require_unique_port), 0,
    "Require globally-unique ephemeral port for outgoing connections");
#define V_tcp_require_unique_port VNET(tcp_require_unique_port)
156
157 /*
158 * TCP attaches to socket via pru_attach(), reserving space,
159 * and an internet control block.
160 */
161 static int
tcp_usr_attach(struct socket * so,int proto,struct thread * td)162 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
163 {
164 struct inpcb *inp;
165 struct tcpcb *tp = NULL;
166 int error;
167 TCPDEBUG0;
168
169 inp = sotoinpcb(so);
170 KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
171 TCPDEBUG1();
172
173 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
174 error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
175 if (error)
176 goto out;
177 }
178
179 so->so_rcv.sb_flags |= SB_AUTOSIZE;
180 so->so_snd.sb_flags |= SB_AUTOSIZE;
181 error = in_pcballoc(so, &V_tcbinfo);
182 if (error)
183 goto out;
184 inp = sotoinpcb(so);
185 #ifdef INET6
186 if (inp->inp_vflag & INP_IPV6PROTO) {
187 inp->inp_vflag |= INP_IPV6;
188 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
189 inp->inp_vflag |= INP_IPV4;
190 inp->in6p_hops = -1; /* use kernel default */
191 }
192 else
193 #endif
194 inp->inp_vflag |= INP_IPV4;
195 tp = tcp_newtcpcb(inp);
196 if (tp == NULL) {
197 error = ENOBUFS;
198 in_pcbdetach(inp);
199 in_pcbfree(inp);
200 goto out;
201 }
202 tp->t_state = TCPS_CLOSED;
203 INP_WUNLOCK(inp);
204 TCPSTATES_INC(TCPS_CLOSED);
205 out:
206 TCPDEBUG2(PRU_ATTACH);
207 TCP_PROBE2(debug__user, tp, PRU_ATTACH);
208 return (error);
209 }
210
211 /*
212 * tcp_usr_detach is called when the socket layer loses its final reference
213 * to the socket, be it a file descriptor reference, a reference from TCP,
214 * etc. At this point, there is only one case in which we will keep around
215 * inpcb state: time wait.
216 */
217 static void
tcp_usr_detach(struct socket * so)218 tcp_usr_detach(struct socket *so)
219 {
220 struct inpcb *inp;
221 struct tcpcb *tp;
222
223 inp = sotoinpcb(so);
224 KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
225 INP_WLOCK(inp);
226 KASSERT(so->so_pcb == inp && inp->inp_socket == so,
227 ("%s: socket %p inp %p mismatch", __func__, so, inp));
228
229 tp = intotcpcb(inp);
230
231 if (inp->inp_flags & INP_TIMEWAIT) {
232 /*
233 * There are two cases to handle: one in which the time wait
234 * state is being discarded (INP_DROPPED), and one in which
235 * this connection will remain in timewait. In the former,
236 * it is time to discard all state (except tcptw, which has
237 * already been discarded by the timewait close code, which
238 * should be further up the call stack somewhere). In the
239 * latter case, we detach from the socket, but leave the pcb
240 * present until timewait ends.
241 *
242 * XXXRW: Would it be cleaner to free the tcptw here?
243 *
244 * Astute question indeed, from twtcp perspective there are
245 * four cases to consider:
246 *
247 * #1 tcp_usr_detach is called at tcptw creation time by
248 * tcp_twstart, then do not discard the newly created tcptw
249 * and leave inpcb present until timewait ends
250 * #2 tcp_usr_detach is called at tcptw creation time by
251 * tcp_twstart, but connection is local and tw will be
252 * discarded immediately
253 * #3 tcp_usr_detach is called at timewait end (or reuse) by
254 * tcp_twclose, then the tcptw has already been discarded
255 * (or reused) and inpcb is freed here
256 * #4 tcp_usr_detach is called() after timewait ends (or reuse)
257 * (e.g. by soclose), then tcptw has already been discarded
258 * (or reused) and inpcb is freed here
259 *
260 * In all three cases the tcptw should not be freed here.
261 */
262 if (inp->inp_flags & INP_DROPPED) {
263 in_pcbdetach(inp);
264 if (__predict_true(tp == NULL)) {
265 in_pcbfree(inp);
266 } else {
267 /*
268 * This case should not happen as in TIMEWAIT
269 * state the inp should not be destroyed before
270 * its tcptw. If INVARIANTS is defined, panic.
271 */
272 #ifdef INVARIANTS
273 panic("%s: Panic before an inp double-free: "
274 "INP_TIMEWAIT && INP_DROPPED && tp != NULL"
275 , __func__);
276 #else
277 log(LOG_ERR, "%s: Avoid an inp double-free: "
278 "INP_TIMEWAIT && INP_DROPPED && tp != NULL"
279 , __func__);
280 #endif
281 INP_WUNLOCK(inp);
282 }
283 } else {
284 in_pcbdetach(inp);
285 INP_WUNLOCK(inp);
286 }
287 } else {
288 /*
289 * If the connection is not in timewait, we consider two
290 * two conditions: one in which no further processing is
291 * necessary (dropped || embryonic), and one in which TCP is
292 * not yet done, but no longer requires the socket, so the
293 * pcb will persist for the time being.
294 *
295 * XXXRW: Does the second case still occur?
296 */
297 if (inp->inp_flags & INP_DROPPED ||
298 tp->t_state < TCPS_SYN_SENT) {
299 tcp_discardcb(tp);
300 in_pcbdetach(inp);
301 in_pcbfree(inp);
302 } else {
303 in_pcbdetach(inp);
304 INP_WUNLOCK(inp);
305 }
306 }
307 }
308
309 #ifdef INET
310 /*
311 * Give the socket an address.
312 */
313 static int
tcp_usr_bind(struct socket * so,struct sockaddr * nam,struct thread * td)314 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
315 {
316 int error = 0;
317 struct inpcb *inp;
318 struct tcpcb *tp = NULL;
319 struct sockaddr_in *sinp;
320
321 sinp = (struct sockaddr_in *)nam;
322 if (nam->sa_family != AF_INET) {
323 /*
324 * Preserve compatibility with old programs.
325 */
326 if (nam->sa_family != AF_UNSPEC ||
327 nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
328 sinp->sin_addr.s_addr != INADDR_ANY)
329 return (EAFNOSUPPORT);
330 nam->sa_family = AF_INET;
331 }
332 if (nam->sa_len != sizeof(*sinp))
333 return (EINVAL);
334
335 /*
336 * Must check for multicast addresses and disallow binding
337 * to them.
338 */
339 if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
340 return (EAFNOSUPPORT);
341
342 TCPDEBUG0;
343 inp = sotoinpcb(so);
344 KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
345 INP_WLOCK(inp);
346 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
347 error = EINVAL;
348 goto out;
349 }
350 tp = intotcpcb(inp);
351 TCPDEBUG1();
352 INP_HASH_WLOCK(&V_tcbinfo);
353 error = in_pcbbind(inp, nam, td->td_ucred);
354 INP_HASH_WUNLOCK(&V_tcbinfo);
355 out:
356 TCPDEBUG2(PRU_BIND);
357 TCP_PROBE2(debug__user, tp, PRU_BIND);
358 INP_WUNLOCK(inp);
359
360 return (error);
361 }
362 #endif /* INET */
363
364 #ifdef INET6
365 static int
tcp6_usr_bind(struct socket * so,struct sockaddr * nam,struct thread * td)366 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
367 {
368 int error = 0;
369 struct inpcb *inp;
370 struct tcpcb *tp = NULL;
371 struct sockaddr_in6 *sin6;
372 u_char vflagsav;
373
374 sin6 = (struct sockaddr_in6 *)nam;
375 if (nam->sa_family != AF_INET6)
376 return (EAFNOSUPPORT);
377 if (nam->sa_len != sizeof(*sin6))
378 return (EINVAL);
379
380 /*
381 * Must check for multicast addresses and disallow binding
382 * to them.
383 */
384 if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
385 return (EAFNOSUPPORT);
386
387 TCPDEBUG0;
388 inp = sotoinpcb(so);
389 KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
390 INP_WLOCK(inp);
391 vflagsav = inp->inp_vflag;
392 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
393 error = EINVAL;
394 goto out;
395 }
396 tp = intotcpcb(inp);
397 TCPDEBUG1();
398 INP_HASH_WLOCK(&V_tcbinfo);
399 inp->inp_vflag &= ~INP_IPV4;
400 inp->inp_vflag |= INP_IPV6;
401 #ifdef INET
402 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
403 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
404 inp->inp_vflag |= INP_IPV4;
405 else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
406 struct sockaddr_in sin;
407
408 in6_sin6_2_sin(&sin, sin6);
409 if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
410 error = EAFNOSUPPORT;
411 INP_HASH_WUNLOCK(&V_tcbinfo);
412 goto out;
413 }
414 inp->inp_vflag |= INP_IPV4;
415 inp->inp_vflag &= ~INP_IPV6;
416 error = in_pcbbind(inp, (struct sockaddr *)&sin,
417 td->td_ucred);
418 INP_HASH_WUNLOCK(&V_tcbinfo);
419 goto out;
420 }
421 }
422 #endif
423 error = in6_pcbbind(inp, nam, td->td_ucred);
424 INP_HASH_WUNLOCK(&V_tcbinfo);
425 out:
426 if (error != 0)
427 inp->inp_vflag = vflagsav;
428 TCPDEBUG2(PRU_BIND);
429 TCP_PROBE2(debug__user, tp, PRU_BIND);
430 INP_WUNLOCK(inp);
431 return (error);
432 }
433 #endif /* INET6 */
434
435 #ifdef INET
436 /*
437 * Prepare to accept connections.
438 */
439 static int
tcp_usr_listen(struct socket * so,int backlog,struct thread * td)440 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
441 {
442 int error = 0;
443 struct inpcb *inp;
444 struct tcpcb *tp = NULL;
445
446 TCPDEBUG0;
447 inp = sotoinpcb(so);
448 KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
449 INP_WLOCK(inp);
450 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
451 error = EINVAL;
452 goto out;
453 }
454 tp = intotcpcb(inp);
455 TCPDEBUG1();
456 SOCK_LOCK(so);
457 error = solisten_proto_check(so);
458 INP_HASH_WLOCK(&V_tcbinfo);
459 if (error == 0 && inp->inp_lport == 0)
460 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
461 INP_HASH_WUNLOCK(&V_tcbinfo);
462 if (error == 0) {
463 tcp_state_change(tp, TCPS_LISTEN);
464 solisten_proto(so, backlog);
465 #ifdef TCP_OFFLOAD
466 if ((so->so_options & SO_NO_OFFLOAD) == 0)
467 tcp_offload_listen_start(tp);
468 #endif
469 }
470 SOCK_UNLOCK(so);
471
472 if (IS_FASTOPEN(tp->t_flags))
473 tp->t_tfo_pending = tcp_fastopen_alloc_counter();
474
475 out:
476 TCPDEBUG2(PRU_LISTEN);
477 TCP_PROBE2(debug__user, tp, PRU_LISTEN);
478 INP_WUNLOCK(inp);
479 return (error);
480 }
481 #endif /* INET */
482
483 #ifdef INET6
484 static int
tcp6_usr_listen(struct socket * so,int backlog,struct thread * td)485 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
486 {
487 int error = 0;
488 struct inpcb *inp;
489 struct tcpcb *tp = NULL;
490 u_char vflagsav;
491
492 TCPDEBUG0;
493 inp = sotoinpcb(so);
494 KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
495 INP_WLOCK(inp);
496 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
497 error = EINVAL;
498 goto out;
499 }
500 vflagsav = inp->inp_vflag;
501 tp = intotcpcb(inp);
502 TCPDEBUG1();
503 SOCK_LOCK(so);
504 error = solisten_proto_check(so);
505 INP_HASH_WLOCK(&V_tcbinfo);
506 if (error == 0 && inp->inp_lport == 0) {
507 inp->inp_vflag &= ~INP_IPV4;
508 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
509 inp->inp_vflag |= INP_IPV4;
510 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
511 }
512 INP_HASH_WUNLOCK(&V_tcbinfo);
513 if (error == 0) {
514 tcp_state_change(tp, TCPS_LISTEN);
515 solisten_proto(so, backlog);
516 #ifdef TCP_OFFLOAD
517 if ((so->so_options & SO_NO_OFFLOAD) == 0)
518 tcp_offload_listen_start(tp);
519 #endif
520 }
521 SOCK_UNLOCK(so);
522
523 if (IS_FASTOPEN(tp->t_flags))
524 tp->t_tfo_pending = tcp_fastopen_alloc_counter();
525
526 if (error != 0)
527 inp->inp_vflag = vflagsav;
528
529 out:
530 TCPDEBUG2(PRU_LISTEN);
531 TCP_PROBE2(debug__user, tp, PRU_LISTEN);
532 INP_WUNLOCK(inp);
533 return (error);
534 }
535 #endif /* INET6 */
536
537 #ifdef INET
/*
 * Initiate an IPv4 connection to a peer (pru_connect).
 *
 * After validating the destination, tcp_connect() sets the connection up;
 * unless a TOE device takes the connection over, the keep-alive timer is
 * started and the initial segment is sent through the stack's output
 * routine.
 *
 * Returns EAFNOSUPPORT / EINVAL / EACCES for an unacceptable destination,
 * the prison_remote_ip4() error, EADDRINUSE for a pcb in timewait,
 * ECONNREFUSED for a dropped pcb, or the result of connect/output.
 */
static int
tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct epoch_tracker et;
	struct sockaddr_in *sin4;
	struct tcpcb *tp = NULL;
	struct inpcb *inp;
	int error = 0;
	TCPDEBUG0;

	sin4 = (struct sockaddr_in *)nam;
	if (nam->sa_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin4))
		return (EINVAL);

	/* TCP ``connections'' to multicast or broadcast are not allowed. */
	if (IN_MULTICAST(ntohl(sin4->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if (ntohl(sin4->sin_addr.s_addr) == INADDR_BROADCAST)
		return (EACCES);
	error = prison_remote_ip4(td->td_ucred, &sin4->sin_addr);
	if (error != 0)
		return (error);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		error = EADDRINUSE;
		goto out;
	}
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		error = ECONNREFUSED;
		goto out;
	}
	tp = intotcpcb(inp);
	TCPDEBUG1();

	NET_EPOCH_ENTER(et);
	error = tcp_connect(tp, nam, td);
	if (error != 0)
		goto out_in_epoch;
#ifdef TCP_OFFLOAD
	/* If a TOE device accepts the connection we are done here. */
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0) {
		error = tcp_offload_connect(so, nam);
		if (error == 0)
			goto out_in_epoch;
	}
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tp->t_fb->tfb_tcp_output(tp);
out_in_epoch:
	NET_EPOCH_EXIT(et);
out:
	TCPDEBUG2(PRU_CONNECT);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
603 #endif /* INET */
604
605 #ifdef INET6
/*
 * Initiate a connection on an AF_INET6 socket (pru_connect).
 *
 * A v4-mapped destination is (with INET compiled in) converted to a
 * sockaddr_in and connected through tcp_connect(); any other destination
 * goes through tcp6_connect().  The pcb's address-family flags are changed
 * along the way; if the call fails while the pcb still has no local port,
 * they are restored to their values on entry.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct epoch_tracker et;
	struct sockaddr_in6 *sin6;
	struct tcpcb *tp = NULL;
	struct inpcb *inp;
	u_int8_t saved_incflags;
	u_char saved_vflag;
	int error = 0;
	TCPDEBUG0;

	sin6 = (struct sockaddr_in6 *)nam;
	if (nam->sa_family != AF_INET6)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin6))
		return (EINVAL);

	/* TCP ``connections'' to multicast addresses are not allowed. */
	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
		return (EAFNOSUPPORT);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	saved_vflag = inp->inp_vflag;
	saved_incflags = inp->inp_inc.inc_flags;
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		error = EADDRINUSE;
		goto out;
	}
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		error = ECONNREFUSED;
		goto out;
	}
	tp = intotcpcb(inp);
	TCPDEBUG1();
#ifdef INET
	/*
	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
	 * therefore probably require the hash lock, which isn't held here.
	 * Is this a significant problem?
	 */
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		struct sockaddr_in sin4;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}
		if ((inp->inp_vflag & INP_IPV4) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}

		/* Apply the IPv4 destination checks to the mapped address. */
		in6_sin6_2_sin(&sin4, sin6);
		if (IN_MULTICAST(ntohl(sin4.sin_addr.s_addr))) {
			error = EAFNOSUPPORT;
			goto out;
		}
		if (ntohl(sin4.sin_addr.s_addr) == INADDR_BROADCAST) {
			error = EACCES;
			goto out;
		}
		error = prison_remote_ip4(td->td_ucred, &sin4.sin_addr);
		if (error != 0)
			goto out;

		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		NET_EPOCH_ENTER(et);
		error = tcp_connect(tp, (struct sockaddr *)&sin4, td);
		if (error != 0)
			goto out_in_epoch;
#ifdef TCP_OFFLOAD
		if (registered_toedevs > 0 &&
		    (so->so_options & SO_NO_OFFLOAD) == 0) {
			error = tcp_offload_connect(so, nam);
			if (error == 0)
				goto out_in_epoch;
		}
#endif
		/* Unlike the IPv6 path, no keep timer is armed here. */
		error = tp->t_fb->tfb_tcp_output(tp);
		goto out_in_epoch;
	}
	/* Native IPv6 destination: the pcb must still allow IPv6. */
	if ((inp->inp_vflag & INP_IPV6) == 0) {
		error = EAFNOSUPPORT;
		goto out;
	}
#endif
	error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr);
	if (error != 0)
		goto out;
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_flags |= INC_ISIPV6;
	NET_EPOCH_ENTER(et);
	error = tcp6_connect(tp, nam, td);
	if (error != 0)
		goto out_in_epoch;
#ifdef TCP_OFFLOAD
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0) {
		error = tcp_offload_connect(so, nam);
		if (error == 0)
			goto out_in_epoch;
	}
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tp->t_fb->tfb_tcp_output(tp);
out_in_epoch:
	NET_EPOCH_EXIT(et);
out:
	/*
	 * If the implicit bind in the connect call failed (the pcb still
	 * has no local port), put back the flags modified above.
	 */
	if (error != 0 && inp->inp_lport == 0) {
		inp->inp_vflag = saved_vflag;
		inp->inp_inc.inc_flags = saved_incflags;
	}

	TCPDEBUG2(PRU_CONNECT);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
729 #endif /* INET6 */
730
731 /*
732 * Initiate disconnect from peer.
733 * If connection never passed embryonic stage, just drop;
734 * else if don't need to let data drain, then can just drop anyways,
735 * else have to begin TCP shutdown process: mark socket disconnecting,
736 * drain unread data, state switch to reflect user close, and
737 * send segment (e.g. FIN) to peer. Socket will be really disconnected
738 * when peer sends FIN and acks ours.
739 *
740 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
741 */
742 static int
tcp_usr_disconnect(struct socket * so)743 tcp_usr_disconnect(struct socket *so)
744 {
745 struct inpcb *inp;
746 struct tcpcb *tp = NULL;
747 struct epoch_tracker et;
748 int error = 0;
749
750 TCPDEBUG0;
751 NET_EPOCH_ENTER(et);
752 inp = sotoinpcb(so);
753 KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
754 INP_WLOCK(inp);
755 if (inp->inp_flags & INP_TIMEWAIT)
756 goto out;
757 if (inp->inp_flags & INP_DROPPED) {
758 error = ECONNRESET;
759 goto out;
760 }
761 tp = intotcpcb(inp);
762 TCPDEBUG1();
763 tcp_disconnect(tp);
764 out:
765 TCPDEBUG2(PRU_DISCONNECT);
766 TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
767 INP_WUNLOCK(inp);
768 NET_EPOCH_EXIT(et);
769 return (error);
770 }
771
772 #ifdef INET
773 /*
774 * Accept a connection. Essentially all the work is done at higher levels;
775 * just return the address of the peer, storing through addr.
776 */
777 static int
tcp_usr_accept(struct socket * so,struct sockaddr ** nam)778 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
779 {
780 int error = 0;
781 struct inpcb *inp = NULL;
782 struct tcpcb *tp = NULL;
783 struct in_addr addr;
784 in_port_t port = 0;
785 TCPDEBUG0;
786
787 if (so->so_state & SS_ISDISCONNECTED)
788 return (ECONNABORTED);
789
790 inp = sotoinpcb(so);
791 KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
792 INP_WLOCK(inp);
793 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
794 error = ECONNABORTED;
795 goto out;
796 }
797 tp = intotcpcb(inp);
798 TCPDEBUG1();
799
800 /*
801 * We inline in_getpeeraddr and COMMON_END here, so that we can
802 * copy the data of interest and defer the malloc until after we
803 * release the lock.
804 */
805 port = inp->inp_fport;
806 addr = inp->inp_faddr;
807
808 out:
809 TCPDEBUG2(PRU_ACCEPT);
810 TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
811 INP_WUNLOCK(inp);
812 if (error == 0)
813 *nam = in_sockaddr(port, &addr);
814 return error;
815 }
816 #endif /* INET */
817
818 #ifdef INET6
819 static int
tcp6_usr_accept(struct socket * so,struct sockaddr ** nam)820 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
821 {
822 struct inpcb *inp = NULL;
823 int error = 0;
824 struct tcpcb *tp = NULL;
825 struct in_addr addr;
826 struct in6_addr addr6;
827 struct epoch_tracker et;
828 in_port_t port = 0;
829 int v4 = 0;
830 TCPDEBUG0;
831
832 if (so->so_state & SS_ISDISCONNECTED)
833 return (ECONNABORTED);
834
835 inp = sotoinpcb(so);
836 KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
837 NET_EPOCH_ENTER(et);
838 INP_WLOCK(inp);
839 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
840 error = ECONNABORTED;
841 goto out;
842 }
843 tp = intotcpcb(inp);
844 TCPDEBUG1();
845
846 /*
847 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
848 * copy the data of interest and defer the malloc until after we
849 * release the lock.
850 */
851 if (inp->inp_vflag & INP_IPV4) {
852 v4 = 1;
853 port = inp->inp_fport;
854 addr = inp->inp_faddr;
855 } else {
856 port = inp->inp_fport;
857 addr6 = inp->in6p_faddr;
858 }
859
860 out:
861 TCPDEBUG2(PRU_ACCEPT);
862 TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
863 INP_WUNLOCK(inp);
864 NET_EPOCH_EXIT(et);
865 if (error == 0) {
866 if (v4)
867 *nam = in6_v4mapsin6_sockaddr(port, &addr);
868 else
869 *nam = in6_sockaddr(port, &addr6);
870 }
871 return error;
872 }
873 #endif /* INET6 */
874
875 /*
876 * Mark the connection as being incapable of further output.
877 */
878 static int
tcp_usr_shutdown(struct socket * so)879 tcp_usr_shutdown(struct socket *so)
880 {
881 int error = 0;
882 struct inpcb *inp;
883 struct tcpcb *tp = NULL;
884 struct epoch_tracker et;
885
886 TCPDEBUG0;
887 NET_EPOCH_ENTER(et);
888 inp = sotoinpcb(so);
889 KASSERT(inp != NULL, ("inp == NULL"));
890 INP_WLOCK(inp);
891 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
892 error = ECONNRESET;
893 goto out;
894 }
895 tp = intotcpcb(inp);
896 TCPDEBUG1();
897 socantsendmore(so);
898 tcp_usrclosed(tp);
899 if (!(inp->inp_flags & INP_DROPPED))
900 error = tp->t_fb->tfb_tcp_output(tp);
901
902 out:
903 TCPDEBUG2(PRU_SHUTDOWN);
904 TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
905 INP_WUNLOCK(inp);
906 NET_EPOCH_EXIT(et);
907
908 return (error);
909 }
910
911 /*
912 * After a receive, possibly send window update to peer.
913 */
914 static int
tcp_usr_rcvd(struct socket * so,int flags)915 tcp_usr_rcvd(struct socket *so, int flags)
916 {
917 struct epoch_tracker et;
918 struct inpcb *inp;
919 struct tcpcb *tp = NULL;
920 int error = 0;
921
922 TCPDEBUG0;
923 inp = sotoinpcb(so);
924 KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
925 INP_WLOCK(inp);
926 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
927 error = ECONNRESET;
928 goto out;
929 }
930 tp = intotcpcb(inp);
931 TCPDEBUG1();
932 /*
933 * For passively-created TFO connections, don't attempt a window
934 * update while still in SYN_RECEIVED as this may trigger an early
935 * SYN|ACK. It is preferable to have the SYN|ACK be sent along with
936 * application response data, or failing that, when the DELACK timer
937 * expires.
938 */
939 if (IS_FASTOPEN(tp->t_flags) &&
940 (tp->t_state == TCPS_SYN_RECEIVED))
941 goto out;
942 NET_EPOCH_ENTER(et);
943 #ifdef TCP_OFFLOAD
944 if (tp->t_flags & TF_TOE)
945 tcp_offload_rcvd(tp);
946 else
947 #endif
948 tp->t_fb->tfb_tcp_output(tp);
949 NET_EPOCH_EXIT(et);
950 out:
951 TCPDEBUG2(PRU_RCVD);
952 TCP_PROBE2(debug__user, tp, PRU_RCVD);
953 INP_WUNLOCK(inp);
954 return (error);
955 }
956
957 /*
958 * Do a send by putting data in output queue and updating urgent
959 * marker if URG set. Possibly send more data. Unlike the other
960 * pru_*() routines, the mbuf chains are our responsibility. We
961 * must either enqueue them or free them. The other pru_* routines
962 * generally are caller-frees.
963 */
964 static int
tcp_usr_send(struct socket * so,int flags,struct mbuf * m,struct sockaddr * nam,struct mbuf * control,struct thread * td)965 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
966 struct sockaddr *nam, struct mbuf *control, struct thread *td)
967 {
968 struct epoch_tracker et;
969 int error = 0;
970 struct inpcb *inp;
971 struct tcpcb *tp = NULL;
972 #ifdef INET
973 #ifdef INET6
974 struct sockaddr_in sin;
975 #endif
976 struct sockaddr_in *sinp;
977 #endif
978 #ifdef INET6
979 int isipv6;
980 #endif
981 u_int8_t incflagsav;
982 u_char vflagsav;
983 bool restoreflags;
984 TCPDEBUG0;
985
986 /*
987 * We require the pcbinfo "read lock" if we will close the socket
988 * as part of this call.
989 */
990 NET_EPOCH_ENTER(et);
991 inp = sotoinpcb(so);
992 KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
993 INP_WLOCK(inp);
994 vflagsav = inp->inp_vflag;
995 incflagsav = inp->inp_inc.inc_flags;
996 restoreflags = false;
997 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
998 if (control)
999 m_freem(control);
1000 error = ECONNRESET;
1001 goto out;
1002 }
1003 if (control != NULL) {
1004 /* TCP doesn't do control messages (rights, creds, etc) */
1005 if (control->m_len) {
1006 m_freem(control);
1007 error = EINVAL;
1008 goto out;
1009 }
1010 m_freem(control); /* empty control, just free it */
1011 control = NULL;
1012 }
1013 tp = intotcpcb(inp);
1014 if ((flags & PRUS_OOB) != 0 &&
1015 (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
1016 goto out;
1017
1018 TCPDEBUG1();
1019 if (nam != NULL && tp->t_state < TCPS_SYN_SENT) {
1020 switch (nam->sa_family) {
1021 #ifdef INET
1022 case AF_INET:
1023 sinp = (struct sockaddr_in *)nam;
1024 if (sinp->sin_len != sizeof(struct sockaddr_in)) {
1025 error = EINVAL;
1026 goto out;
1027 }
1028 if ((inp->inp_vflag & INP_IPV6) != 0) {
1029 error = EAFNOSUPPORT;
1030 goto out;
1031 }
1032 if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
1033 error = EAFNOSUPPORT;
1034 goto out;
1035 }
1036 if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
1037 error = EACCES;
1038 goto out;
1039 }
1040 if ((error = prison_remote_ip4(td->td_ucred,
1041 &sinp->sin_addr)))
1042 goto out;
1043 #ifdef INET6
1044 isipv6 = 0;
1045 #endif
1046 break;
1047 #endif /* INET */
1048 #ifdef INET6
1049 case AF_INET6:
1050 {
1051 struct sockaddr_in6 *sin6;
1052
1053 sin6 = (struct sockaddr_in6 *)nam;
1054 if (sin6->sin6_len != sizeof(*sin6)) {
1055 error = EINVAL;
1056 goto out;
1057 }
1058 if ((inp->inp_vflag & INP_IPV6PROTO) == 0) {
1059 error = EAFNOSUPPORT;
1060 goto out;
1061 }
1062 if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
1063 error = EAFNOSUPPORT;
1064 goto out;
1065 }
1066 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
1067 #ifdef INET
1068 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
1069 error = EINVAL;
1070 goto out;
1071 }
1072 if ((inp->inp_vflag & INP_IPV4) == 0) {
1073 error = EAFNOSUPPORT;
1074 goto out;
1075 }
1076 restoreflags = true;
1077 inp->inp_vflag &= ~INP_IPV6;
1078 sinp = &sin;
1079 in6_sin6_2_sin(sinp, sin6);
1080 if (IN_MULTICAST(
1081 ntohl(sinp->sin_addr.s_addr))) {
1082 error = EAFNOSUPPORT;
1083 goto out;
1084 }
1085 if ((error = prison_remote_ip4(td->td_ucred,
1086 &sinp->sin_addr)))
1087 goto out;
1088 isipv6 = 0;
1089 #else /* !INET */
1090 error = EAFNOSUPPORT;
1091 goto out;
1092 #endif /* INET */
1093 } else {
1094 if ((inp->inp_vflag & INP_IPV6) == 0) {
1095 error = EAFNOSUPPORT;
1096 goto out;
1097 }
1098 restoreflags = true;
1099 inp->inp_vflag &= ~INP_IPV4;
1100 inp->inp_inc.inc_flags |= INC_ISIPV6;
1101 if ((error = prison_remote_ip6(td->td_ucred,
1102 &sin6->sin6_addr)))
1103 goto out;
1104 isipv6 = 1;
1105 }
1106 break;
1107 }
1108 #endif /* INET6 */
1109 default:
1110 error = EAFNOSUPPORT;
1111 goto out;
1112 }
1113 }
1114 if (!(flags & PRUS_OOB)) {
1115 sbappendstream(&so->so_snd, m, flags);
1116 m = NULL;
1117 if (nam && tp->t_state < TCPS_SYN_SENT) {
1118 /*
1119 * Do implied connect if not yet connected,
1120 * initialize window to default value, and
1121 * initialize maxseg using peer's cached MSS.
1122 */
1123 #ifdef INET6
1124 if (isipv6)
1125 error = tcp6_connect(tp, nam, td);
1126 #endif /* INET6 */
1127 #if defined(INET6) && defined(INET)
1128 else
1129 #endif
1130 #ifdef INET
1131 error = tcp_connect(tp,
1132 (struct sockaddr *)sinp, td);
1133 #endif
1134 /*
1135 * The bind operation in tcp_connect succeeded. We
1136 * no longer want to restore the flags if later
1137 * operations fail.
1138 */
1139 if (error == 0 || inp->inp_lport != 0)
1140 restoreflags = false;
1141
1142 if (error) {
1143 /* m is freed if PRUS_NOTREADY is unset. */
1144 sbflush(&so->so_snd);
1145 goto out;
1146 }
1147 if (IS_FASTOPEN(tp->t_flags))
1148 tcp_fastopen_connect(tp);
1149 else {
1150 tp->snd_wnd = TTCP_CLIENT_SND_WND;
1151 tcp_mss(tp, -1);
1152 }
1153 }
1154 if (flags & PRUS_EOF) {
1155 /*
1156 * Close the send side of the connection after
1157 * the data is sent.
1158 */
1159 socantsendmore(so);
1160 tcp_usrclosed(tp);
1161 }
1162 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1163 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
1164 (tp->t_fbyte_out == 0) &&
1165 (so->so_snd.sb_ccc > 0)) {
1166 tp->t_fbyte_out = ticks;
1167 if (tp->t_fbyte_out == 0)
1168 tp->t_fbyte_out = 1;
1169 if (tp->t_fbyte_out && tp->t_fbyte_in)
1170 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
1171 }
1172 if (!(inp->inp_flags & INP_DROPPED) &&
1173 !(flags & PRUS_NOTREADY)) {
1174 if (flags & PRUS_MORETOCOME)
1175 tp->t_flags |= TF_MORETOCOME;
1176 error = tp->t_fb->tfb_tcp_output(tp);
1177 if (flags & PRUS_MORETOCOME)
1178 tp->t_flags &= ~TF_MORETOCOME;
1179 }
1180 } else {
1181 /*
1182 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
1183 */
1184 SOCKBUF_LOCK(&so->so_snd);
1185 if (sbspace(&so->so_snd) < -512) {
1186 SOCKBUF_UNLOCK(&so->so_snd);
1187 error = ENOBUFS;
1188 goto out;
1189 }
1190 /*
1191 * According to RFC961 (Assigned Protocols),
1192 * the urgent pointer points to the last octet
1193 * of urgent data. We continue, however,
1194 * to consider it to indicate the first octet
1195 * of data past the urgent section.
1196 * Otherwise, snd_up should be one lower.
1197 */
1198 sbappendstream_locked(&so->so_snd, m, flags);
1199 SOCKBUF_UNLOCK(&so->so_snd);
1200 m = NULL;
1201 if (nam && tp->t_state < TCPS_SYN_SENT) {
1202 /*
1203 * Do implied connect if not yet connected,
1204 * initialize window to default value, and
1205 * initialize maxseg using peer's cached MSS.
1206 */
1207
1208 /*
1209 * Not going to contemplate SYN|URG
1210 */
1211 if (IS_FASTOPEN(tp->t_flags))
1212 tp->t_flags &= ~TF_FASTOPEN;
1213 #ifdef INET6
1214 if (isipv6)
1215 error = tcp6_connect(tp, nam, td);
1216 #endif /* INET6 */
1217 #if defined(INET6) && defined(INET)
1218 else
1219 #endif
1220 #ifdef INET
1221 error = tcp_connect(tp,
1222 (struct sockaddr *)sinp, td);
1223 #endif
1224 /*
1225 * The bind operation in tcp_connect succeeded. We
1226 * no longer want to restore the flags if later
1227 * operations fail.
1228 */
1229 if (error == 0 || inp->inp_lport != 0)
1230 restoreflags = false;
1231
1232 if (error != 0) {
1233 /* m is freed if PRUS_NOTREADY is unset. */
1234 sbflush(&so->so_snd);
1235 goto out;
1236 }
1237 tp->snd_wnd = TTCP_CLIENT_SND_WND;
1238 tcp_mss(tp, -1);
1239 }
1240 tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
1241 if ((flags & PRUS_NOTREADY) == 0) {
1242 tp->t_flags |= TF_FORCEDATA;
1243 error = tp->t_fb->tfb_tcp_output(tp);
1244 tp->t_flags &= ~TF_FORCEDATA;
1245 }
1246 }
1247 TCP_LOG_EVENT(tp, NULL,
1248 &inp->inp_socket->so_rcv,
1249 &inp->inp_socket->so_snd,
1250 TCP_LOG_USERSEND, error,
1251 0, NULL, false);
1252
1253 out:
1254 /*
1255 * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is
1256 * responsible for freeing memory.
1257 */
1258 if (m != NULL && (flags & PRUS_NOTREADY) == 0)
1259 m_freem(m);
1260
1261 /*
1262 * If the request was unsuccessful and we changed flags,
1263 * restore the original flags.
1264 */
1265 if (error != 0 && restoreflags) {
1266 inp->inp_vflag = vflagsav;
1267 inp->inp_inc.inc_flags = incflagsav;
1268 }
1269 TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
1270 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1271 TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
1272 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1273 INP_WUNLOCK(inp);
1274 NET_EPOCH_EXIT(et);
1275 return (error);
1276 }
1277
1278 static int
tcp_usr_ready(struct socket * so,struct mbuf * m,int count)1279 tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
1280 {
1281 struct epoch_tracker et;
1282 struct inpcb *inp;
1283 struct tcpcb *tp;
1284 int error;
1285
1286 inp = sotoinpcb(so);
1287 INP_WLOCK(inp);
1288 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1289 INP_WUNLOCK(inp);
1290 mb_free_notready(m, count);
1291 return (ECONNRESET);
1292 }
1293 tp = intotcpcb(inp);
1294
1295 SOCKBUF_LOCK(&so->so_snd);
1296 error = sbready(&so->so_snd, m, count);
1297 SOCKBUF_UNLOCK(&so->so_snd);
1298 if (error == 0) {
1299 NET_EPOCH_ENTER(et);
1300 error = tp->t_fb->tfb_tcp_output(tp);
1301 NET_EPOCH_EXIT(et);
1302 }
1303 INP_WUNLOCK(inp);
1304
1305 return (error);
1306 }
1307
1308 /*
1309 * Abort the TCP. Drop the connection abruptly.
1310 */
1311 static void
tcp_usr_abort(struct socket * so)1312 tcp_usr_abort(struct socket *so)
1313 {
1314 struct inpcb *inp;
1315 struct tcpcb *tp = NULL;
1316 struct epoch_tracker et;
1317 TCPDEBUG0;
1318
1319 inp = sotoinpcb(so);
1320 KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
1321
1322 NET_EPOCH_ENTER(et);
1323 INP_WLOCK(inp);
1324 KASSERT(inp->inp_socket != NULL,
1325 ("tcp_usr_abort: inp_socket == NULL"));
1326
1327 /*
1328 * If we still have full TCP state, and we're not dropped, drop.
1329 */
1330 if (!(inp->inp_flags & INP_TIMEWAIT) &&
1331 !(inp->inp_flags & INP_DROPPED)) {
1332 tp = intotcpcb(inp);
1333 TCPDEBUG1();
1334 tp = tcp_drop(tp, ECONNABORTED);
1335 if (tp == NULL)
1336 goto dropped;
1337 TCPDEBUG2(PRU_ABORT);
1338 TCP_PROBE2(debug__user, tp, PRU_ABORT);
1339 }
1340 if (!(inp->inp_flags & INP_DROPPED)) {
1341 SOCK_LOCK(so);
1342 so->so_state |= SS_PROTOREF;
1343 SOCK_UNLOCK(so);
1344 inp->inp_flags |= INP_SOCKREF;
1345 }
1346 INP_WUNLOCK(inp);
1347 dropped:
1348 NET_EPOCH_EXIT(et);
1349 }
1350
1351 /*
1352 * TCP socket is closed. Start friendly disconnect.
1353 */
1354 static void
tcp_usr_close(struct socket * so)1355 tcp_usr_close(struct socket *so)
1356 {
1357 struct inpcb *inp;
1358 struct tcpcb *tp = NULL;
1359 struct epoch_tracker et;
1360 TCPDEBUG0;
1361
1362 inp = sotoinpcb(so);
1363 KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
1364
1365 NET_EPOCH_ENTER(et);
1366 INP_WLOCK(inp);
1367 KASSERT(inp->inp_socket != NULL,
1368 ("tcp_usr_close: inp_socket == NULL"));
1369
1370 /*
1371 * If we still have full TCP state, and we're not dropped, initiate
1372 * a disconnect.
1373 */
1374 if (!(inp->inp_flags & INP_TIMEWAIT) &&
1375 !(inp->inp_flags & INP_DROPPED)) {
1376 tp = intotcpcb(inp);
1377 TCPDEBUG1();
1378 tcp_disconnect(tp);
1379 TCPDEBUG2(PRU_CLOSE);
1380 TCP_PROBE2(debug__user, tp, PRU_CLOSE);
1381 }
1382 if (!(inp->inp_flags & INP_DROPPED)) {
1383 SOCK_LOCK(so);
1384 so->so_state |= SS_PROTOREF;
1385 SOCK_UNLOCK(so);
1386 inp->inp_flags |= INP_SOCKREF;
1387 }
1388 INP_WUNLOCK(inp);
1389 NET_EPOCH_EXIT(et);
1390 }
1391
1392 static int
tcp_pru_options_support(struct tcpcb * tp,int flags)1393 tcp_pru_options_support(struct tcpcb *tp, int flags)
1394 {
1395 /*
1396 * If the specific TCP stack has a pru_options
1397 * specified then it does not always support
1398 * all the PRU_XX options and we must ask it.
1399 * If the function is not specified then all
1400 * of the PRU_XX options are supported.
1401 */
1402 int ret = 0;
1403
1404 if (tp->t_fb->tfb_pru_options) {
1405 ret = (*tp->t_fb->tfb_pru_options)(tp, flags);
1406 }
1407 return (ret);
1408 }
1409
1410 /*
1411 * Receive out-of-band data.
1412 */
1413 static int
tcp_usr_rcvoob(struct socket * so,struct mbuf * m,int flags)1414 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1415 {
1416 int error = 0;
1417 struct inpcb *inp;
1418 struct tcpcb *tp = NULL;
1419
1420 TCPDEBUG0;
1421 inp = sotoinpcb(so);
1422 KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
1423 INP_WLOCK(inp);
1424 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1425 error = ECONNRESET;
1426 goto out;
1427 }
1428 tp = intotcpcb(inp);
1429 error = tcp_pru_options_support(tp, PRUS_OOB);
1430 if (error) {
1431 goto out;
1432 }
1433 TCPDEBUG1();
1434 if ((so->so_oobmark == 0 &&
1435 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1436 so->so_options & SO_OOBINLINE ||
1437 tp->t_oobflags & TCPOOB_HADDATA) {
1438 error = EINVAL;
1439 goto out;
1440 }
1441 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1442 error = EWOULDBLOCK;
1443 goto out;
1444 }
1445 m->m_len = 1;
1446 *mtod(m, caddr_t) = tp->t_iobc;
1447 if ((flags & MSG_PEEK) == 0)
1448 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1449
1450 out:
1451 TCPDEBUG2(PRU_RCVOOB);
1452 TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
1453 INP_WUNLOCK(inp);
1454 return (error);
1455 }
1456
#ifdef INET
/*
 * User-request dispatch table for TCP when INET is configured.  Entries
 * are listed alphabetically by member name.
 */
struct pr_usrreqs tcp_usrreqs = {
	.pru_abort =		tcp_usr_abort,
	.pru_accept =		tcp_usr_accept,
	.pru_attach =		tcp_usr_attach,
	.pru_bind =		tcp_usr_bind,
	.pru_close =		tcp_usr_close,
	.pru_connect =		tcp_usr_connect,
	.pru_control =		in_control,
	.pru_detach =		tcp_usr_detach,
	.pru_disconnect =	tcp_usr_disconnect,
	.pru_listen =		tcp_usr_listen,
	.pru_peeraddr =		in_getpeeraddr,
	.pru_rcvd =		tcp_usr_rcvd,
	.pru_rcvoob =		tcp_usr_rcvoob,
	.pru_ready =		tcp_usr_ready,
	.pru_send =		tcp_usr_send,
	.pru_shutdown =		tcp_usr_shutdown,
	.pru_sockaddr =		in_getsockaddr,
	.pru_sosetlabel =	in_pcbsosetlabel,
};
#endif /* INET */
1479
#ifdef INET6
/*
 * User-request dispatch table for TCP when INET6 is configured.  Entries
 * are listed alphabetically by member name.  Address-family specific
 * operations (accept, bind, connect, control, listen, peeraddr, sockaddr)
 * use the tcp6_/in6_ variants; the rest are the common tcp_usr_* handlers.
 */
struct pr_usrreqs tcp6_usrreqs = {
	.pru_abort =		tcp_usr_abort,
	.pru_accept =		tcp6_usr_accept,
	.pru_attach =		tcp_usr_attach,
	.pru_bind =		tcp6_usr_bind,
	.pru_close =		tcp_usr_close,
	.pru_connect =		tcp6_usr_connect,
	.pru_control =		in6_control,
	.pru_detach =		tcp_usr_detach,
	.pru_disconnect =	tcp_usr_disconnect,
	.pru_listen =		tcp6_usr_listen,
	.pru_peeraddr =		in6_mapped_peeraddr,
	.pru_rcvd =		tcp_usr_rcvd,
	.pru_rcvoob =		tcp_usr_rcvoob,
	.pru_ready =		tcp_usr_ready,
	.pru_send =		tcp_usr_send,
	.pru_shutdown =		tcp_usr_shutdown,
	.pru_sockaddr =		in6_mapped_sockaddr,
	.pru_sosetlabel =	in_pcbsosetlabel,
};
#endif /* INET6 */
1502
#ifdef INET
/*
 * Common subroutine to open a TCP connection to the remote host given by
 * `nam' (IPv4).
 *
 * Steps, all performed with the tcbinfo hash write-locked:
 *  - if V_tcp_require_unique_port is set and no local port is bound yet,
 *    call in_pcbbind() to assign one;
 *  - call in_pcbconnect_setup() to choose the local address/port and fill
 *    in the foreign address/port; plain in_pcbconnect() is not used so that
 *    a conflicting inpcb can be reported back through `oinp';
 *  - if a conflicting inpcb was reported, fail with EADDRINUSE;
 *  - if no local port was bound in advance, adopt the chosen one and insert
 *    the inpcb into the hash (EAGAIN if that fails).
 *
 * On success the window-scale request is computed, the socket is marked
 * connecting, and the tcpcb enters SYN-SENT with a fresh ISS (and timestamp
 * offset if TF_REQ_TSTMP is set).
 *
 * The caller must be in a network epoch and hold the inpcb write lock.
 * Returns 0 on success or an errno value.
 */
static int
tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport;
	int error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK(&V_tcbinfo);

	if (V_tcp_require_unique_port && inp->inp_lport == 0) {
		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
		if (error != 0)
			goto out;
	}

	/*
	 * Work on copies of the local address and port; they are committed
	 * to the inpcb only once the setup has succeeded.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
	if (oinp != NULL) {
		/* A conflicting inpcb exists: always EADDRINUSE. */
		error = EADDRINUSE;
		goto out;
	}
	if (error != 0)
		goto out;

	/* Handle initial bind if it hadn't been done in advance. */
	if (inp->inp_lport == 0) {
		inp->inp_lport = lport;
		if (in_pcbinshash(inp) != 0) {
			inp->inp_lport = 0;
			error = EAGAIN;
			goto out;
		}
	}
	inp->inp_laddr = laddr;
	in_pcbrehash(inp);
	INP_HASH_WUNLOCK(&V_tcbinfo);

	/*
	 * Compute window scaling to request:
	 * Scale to fit into sweet spot.  See tcp_syncache.c.
	 * XXX: This should move to tcp_output().
	 */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	soisconnecting(so);
	TCPSTAT_INC(tcps_connattempt);
	tcp_state_change(tp, TCPS_SYN_SENT);
	tp->iss = tcp_new_isn(&inp->inp_inc);
	if ((tp->t_flags & TF_REQ_TSTMP) != 0)
		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
	tcp_sendseqinit(tp);

	return (0);

out:
	INP_HASH_WUNLOCK(&V_tcbinfo);
	return (error);
}
#endif /* INET */
1585
#ifdef INET6
/*
 * IPv6 counterpart of the connect subroutine: open a TCP connection to the
 * remote host given by `nam'.
 *
 * With the tcbinfo hash write-locked, optionally bind a local port (when
 * V_tcp_require_unique_port is set and none is bound yet) and then call
 * in6_pcbconnect().  On success the window-scale request is computed, the
 * socket is marked connecting, and the tcpcb enters SYN-SENT with a fresh
 * ISS (and timestamp offset if TF_REQ_TSTMP is set).
 *
 * The caller must hold the inpcb write lock.  Returns 0 on success or the
 * errno from in6_pcbbind()/in6_pcbconnect().
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb;
	int error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK(&V_tcbinfo);

	error = 0;
	if (V_tcp_require_unique_port && inp->inp_lport == 0)
		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
	if (error == 0)
		error = in6_pcbconnect(inp, nam, td->td_ucred);

	/* The hash lock is dropped on both the success and failure paths. */
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error != 0)
		return (error);

	/* Compute window scaling to request. */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	soisconnecting(inp->inp_socket);
	TCPSTAT_INC(tcps_connattempt);
	tcp_state_change(tp, TCPS_SYN_SENT);
	tp->iss = tcp_new_isn(&inp->inp_inc);
	if ((tp->t_flags & TF_REQ_TSTMP) != 0)
		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
	tcp_sendseqinit(tp);

	return (0);
}
#endif /* INET6 */
1626
1627 /*
1628 * Export TCP internal state information via a struct tcp_info, based on the
1629 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently
1630 * (TCP state machine, etc). We export all information using FreeBSD-native
1631 * constants -- for example, the numeric values for tcpi_state will differ
1632 * from Linux.
1633 */
1634 void
tcp_fill_info(const struct tcpcb * tp,struct tcp_info * ti)1635 tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti)
1636 {
1637
1638 INP_LOCK_ASSERT(tp->t_inpcb);
1639 bzero(ti, sizeof(*ti));
1640
1641 ti->tcpi_state = tp->t_state;
1642 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1643 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1644 if (tp->t_flags & TF_SACK_PERMIT)
1645 ti->tcpi_options |= TCPI_OPT_SACK;
1646 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1647 ti->tcpi_options |= TCPI_OPT_WSCALE;
1648 ti->tcpi_snd_wscale = tp->snd_scale;
1649 ti->tcpi_rcv_wscale = tp->rcv_scale;
1650 }
1651 if (tp->t_flags2 & TF2_ECN_PERMIT)
1652 ti->tcpi_options |= TCPI_OPT_ECN;
1653
1654 ti->tcpi_rto = tp->t_rxtcur * tick;
1655 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
1656 ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
1657 ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
1658
1659 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1660 ti->tcpi_snd_cwnd = tp->snd_cwnd;
1661
1662 /*
1663 * FreeBSD-specific extension fields for tcp_info.
1664 */
1665 ti->tcpi_rcv_space = tp->rcv_wnd;
1666 ti->tcpi_rcv_nxt = tp->rcv_nxt;
1667 ti->tcpi_snd_wnd = tp->snd_wnd;
1668 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
1669 ti->tcpi_snd_nxt = tp->snd_nxt;
1670 ti->tcpi_snd_mss = tp->t_maxseg;
1671 ti->tcpi_rcv_mss = tp->t_maxseg;
1672 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
1673 ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
1674 ti->tcpi_snd_zerowin = tp->t_sndzerowin;
1675 ti->tcpi_snd_una = tp->snd_una;
1676 ti->tcpi_snd_max = tp->snd_max;
1677 ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
1678 ti->tcpi_rcv_adv = tp->rcv_adv;
1679 ti->tcpi_dupacks = tp->t_dupacks;
1680 #ifdef TCP_OFFLOAD
1681 if (tp->t_flags & TF_TOE) {
1682 ti->tcpi_options |= TCPI_OPT_TOE;
1683 tcp_offload_tcp_info(tp, ti);
1684 }
1685 #endif
1686 }
1687
/*
 * tcp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 *
 * INP_WLOCK_RECHECK_CLEANUP(inp, cleanup):
 *   Write-lock `inp'.  If the inpcb has INP_TIMEWAIT or INP_DROPPED set,
 *   unlock it, execute the `cleanup' statement and return ECONNRESET from
 *   the *enclosing function*.  Otherwise reload the enclosing function's
 *   local variable `tp' from the inpcb and continue with the lock held.
 *
 * INP_WLOCK_RECHECK(inp):
 *   Same, with no cleanup action.
 *
 * Both macros therefore require a variable named `tp' in scope and may only
 * be used in functions that return an errno-style int.  The `inp' argument
 * is parenthesized on every use so any pointer expression may be passed.
 */
#define	INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {			\
	INP_WLOCK(inp);							\
	if ((inp)->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
		INP_WUNLOCK(inp);					\
		cleanup;						\
		return (ECONNRESET);					\
	}								\
	tp = intotcpcb(inp);						\
} while (0)
#define	INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
1704
/*
 * Socket-option entry point for TCP sockets.
 *
 * - Options whose level is not IPPROTO_TCP are forwarded to ip6_ctloutput()
 *   (for inpcbs with INP_IPV6PROTO) or ip_ctloutput().  After a successful
 *   set of IPV6_USE_MIN_MTU the inpcb/tcpcb are additionally adjusted here.
 * - TCP_FUNCTION_BLK (set and get) is handled here directly so that an
 *   individual TCP stack can never override it.
 * - Every other IPPROTO_TCP option is delegated to the current stack's
 *   tfb_tcp_ctloutput() hook, which is entered with the inpcb write-locked
 *   and is responsible for unlocking it.
 *
 * Returns 0 on success or an errno value; ECONNRESET is returned whenever
 * the inpcb is found to be in timewait or dropped.
 */
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct tcp_function_block *blk;
	struct tcp_function_set fsn;

	error = 0;
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
	if (sopt->sopt_level != IPPROTO_TCP) {
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6PROTO) {
			error = ip6_ctloutput(so, sopt);
			/*
			 * In case of the IPV6_USE_MIN_MTU socket option,
			 * set the INC_IPV6MINMTU flag to announce a
			 * corresponding MSS during the initial handshake.
			 * If the TCP connection is not in the front states,
			 * just reduce the MSS being used.
			 * This avoids the sending of TCP segments which will
			 * be fragmented at the IPv6 layer.
			 */
			if ((error == 0) &&
			    (sopt->sopt_dir == SOPT_SET) &&
			    (sopt->sopt_level == IPPROTO_IPV6) &&
			    (sopt->sopt_name == IPV6_USE_MIN_MTU)) {
				INP_WLOCK(inp);
				if ((inp->inp_flags &
				    (INP_TIMEWAIT | INP_DROPPED))) {
					INP_WUNLOCK(inp);
					return (ECONNRESET);
				}
				inp->inp_inc.inc_flags |= INC_IPV6MINMTU;
				tp = intotcpcb(inp);
				if ((tp->t_state >= TCPS_SYN_SENT) &&
				    (inp->inp_inc.inc_flags & INC_ISIPV6)) {
					struct ip6_pktopts *opt;

					/*
					 * Clamp the MSS to TCP6_MSS only when
					 * the output options ask for the
					 * minimum MTU on all packets.
					 */
					opt = inp->in6p_outputopts;
					if ((opt != NULL) &&
					    (opt->ip6po_minmtu ==
					    IP6PO_MINMTU_ALL)) {
						if (tp->t_maxseg > TCP6_MSS) {
							tp->t_maxseg = TCP6_MSS;
						}
					}
				}
				INP_WUNLOCK(inp);
			}
		}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET
		{
			error = ip_ctloutput(so, sopt);
		}
#endif
		return (error);
	}
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	/*
	 * Protect the TCP option TCP_FUNCTION_BLK so
	 * that a sub-function can *never* overwrite this.
	 */
	if ((sopt->sopt_dir == SOPT_SET) &&
	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
		/* The inpcb lock is dropped around the copyin. */
		INP_WUNLOCK(inp);
		error = sooptcopyin(sopt, &fsn, sizeof fsn,
		    sizeof fsn);
		if (error)
			return (error);
		/* Re-lock and revalidate; may return ECONNRESET from here. */
		INP_WLOCK_RECHECK(inp);
		/* The lookup takes a reference on the block it returns. */
		blk = find_and_ref_tcp_functions(&fsn);
		if (blk == NULL) {
			INP_WUNLOCK(inp);
			return (ENOENT);
		}
		if (tp->t_fb == blk) {
			/* You already have this */
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (0);
		}
		if (tp->t_state != TCPS_CLOSED) {
			/*
			 * The user has advanced the state
			 * past the initial point, we may not
			 * be able to switch.
			 */
			if (blk->tfb_tcp_handoff_ok != NULL) {
				/*
				 * Does the stack provide a
				 * query mechanism, if so it may
				 * still be possible?
				 */
				error = (*blk->tfb_tcp_handoff_ok)(tp);
			} else
				error = EINVAL;
			if (error) {
				refcount_release(&blk->tfb_refcnt);
				INP_WUNLOCK(inp);
				return(error);
			}
		}
		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (ENOENT);
		}
		/*
		 * Switch stacks: let the old stack clean up first.  Its
		 * reference is released further below, once the new stack
		 * has initialized; the lookup acquired a ref on the new
		 * one already.
		 */
		if (tp->t_fb->tfb_tcp_fb_fini) {
			struct epoch_tracker et;
			/*
			 * Tell the stack to cleanup with 0 i.e.
			 * the tcb is not going away.
			 */
			NET_EPOCH_ENTER(et);
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
			NET_EPOCH_EXIT(et);
		}
#ifdef TCPHPTS
		/* Assure that we are not on any hpts */
		tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
#endif
		if (blk->tfb_tcp_fb_init) {
			error = (*blk->tfb_tcp_fb_init)(tp);
			if (error) {
				/*
				 * The new stack refused the connection: drop
				 * its reference and try to re-initialize the
				 * old stack, which is still in tp->t_fb.
				 */
				refcount_release(&blk->tfb_refcnt);
				if (tp->t_fb->tfb_tcp_fb_init) {
					if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0)  {
						/* Fall back failed, drop the connection */
						INP_WUNLOCK(inp);
						soabort(so);
						return(error);
					}
				}
				/* Old stack kept; report the init error. */
				goto err_out;
			}
		}
		/* Commit: release the old stack's reference, install the new. */
		refcount_release(&tp->t_fb->tfb_refcnt);
		tp->t_fb = blk;
#ifdef TCP_OFFLOAD
		if (tp->t_flags & TF_TOE) {
			tcp_offload_ctloutput(tp, sopt->sopt_dir,
			     sopt->sopt_name);
		}
#endif
err_out:
		INP_WUNLOCK(inp);
		return (error);
	} else if ((sopt->sopt_dir == SOPT_GET) &&
	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
		/*
		 * Report the current stack's name and reference count.
		 * strncpy() may leave the name unterminated, so the last
		 * byte is forced to NUL.
		 */
		strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name,
		    TCP_FUNCTION_NAME_LEN_MAX);
		fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
		/* Unlock before copying out to the caller. */
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &fsn, sizeof fsn);
		return (error);
	}
	/* Pass in the INP locked, callee must unlock it */
	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
}
1882
/*
 * tcp_default_ctloutput() declares its scratch buffer as
 * buf[TCP_LOG_ID_LEN].  These compile-time checks (only emitted when the
 * CTASSERT macro is available) verify that TCP_CA_NAME_MAX and
 * TCP_LOG_REASON_LEN do not exceed that size.
 *
 * If this assert becomes untrue, we need to change the size of the buf
 * variable in tcp_default_ctloutput().
 */
#ifdef CTASSERT
CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
#endif
1891
#ifdef KERN_TLS
/*
 * Copy a struct tls_enable socket-option argument in from `sopt'.
 *
 * When the supplied size matches the older struct tls_enable_v0 layout,
 * that structure is copied in and converted field by field into `*tls',
 * which is zeroed first so members absent from the v0 layout read as 0.
 * Any other size is copied in as a current struct tls_enable, with
 * sooptcopyin() enforcing the exact length.
 *
 * Returns 0 on success or the error from sooptcopyin().
 */
static int
copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls)
{
	struct tls_enable_v0 tls_v0;
	int error;

	/* Current layout: copy straight into the caller's structure. */
	if (sopt->sopt_valsize != sizeof(tls_v0))
		return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls)));

	/* v0 layout: copy in, then translate. */
	error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0), sizeof(tls_v0));
	if (error != 0)
		return (error);

	memset(tls, 0, sizeof(*tls));

	/* Key material pointers and their lengths. */
	tls->cipher_key = tls_v0.cipher_key;
	tls->cipher_key_len = tls_v0.cipher_key_len;
	tls->iv = tls_v0.iv;
	tls->iv_len = tls_v0.iv_len;
	tls->auth_key = tls_v0.auth_key;
	tls->auth_key_len = tls_v0.auth_key_len;

	/* Algorithm selection, flags and protocol version. */
	tls->cipher_algorithm = tls_v0.cipher_algorithm;
	tls->auth_algorithm = tls_v0.auth_algorithm;
	tls->flags = tls_v0.flags;
	tls->tls_vmajor = tls_v0.tls_vmajor;
	tls->tls_vminor = tls_v0.tls_vminor;

	return (0);
}
#endif
1922
1923 int
tcp_default_ctloutput(struct socket * so,struct sockopt * sopt,struct inpcb * inp,struct tcpcb * tp)1924 tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
1925 {
1926 int error, opt, optval;
1927 u_int ui;
1928 struct tcp_info ti;
1929 #ifdef KERN_TLS
1930 struct tls_enable tls;
1931 #endif
1932 struct cc_algo *algo;
1933 char *pbuf, buf[TCP_LOG_ID_LEN];
1934 #ifdef STATS
1935 struct statsblob *sbp;
1936 #endif
1937 size_t len;
1938
1939 /*
1940 * For TCP_CCALGOOPT forward the control to CC module, for both
1941 * SOPT_SET and SOPT_GET.
1942 */
1943 switch (sopt->sopt_name) {
1944 case TCP_CCALGOOPT:
1945 INP_WUNLOCK(inp);
1946 if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT)
1947 return (EINVAL);
1948 pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
1949 error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
1950 sopt->sopt_valsize);
1951 if (error) {
1952 free(pbuf, M_TEMP);
1953 return (error);
1954 }
1955 INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
1956 if (CC_ALGO(tp)->ctl_output != NULL)
1957 error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf);
1958 else
1959 error = ENOENT;
1960 INP_WUNLOCK(inp);
1961 if (error == 0 && sopt->sopt_dir == SOPT_GET)
1962 error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
1963 free(pbuf, M_TEMP);
1964 return (error);
1965 }
1966
1967 switch (sopt->sopt_dir) {
1968 case SOPT_SET:
1969 switch (sopt->sopt_name) {
1970 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
1971 case TCP_MD5SIG:
1972 INP_WUNLOCK(inp);
1973 if (!TCPMD5_ENABLED())
1974 return (ENOPROTOOPT);
1975 error = TCPMD5_PCBCTL(inp, sopt);
1976 if (error)
1977 return (error);
1978 INP_WLOCK_RECHECK(inp);
1979 goto unlock_and_done;
1980 #endif /* IPSEC */
1981
1982 case TCP_NODELAY:
1983 case TCP_NOOPT:
1984 INP_WUNLOCK(inp);
1985 error = sooptcopyin(sopt, &optval, sizeof optval,
1986 sizeof optval);
1987 if (error)
1988 return (error);
1989
1990 INP_WLOCK_RECHECK(inp);
1991 switch (sopt->sopt_name) {
1992 case TCP_NODELAY:
1993 opt = TF_NODELAY;
1994 break;
1995 case TCP_NOOPT:
1996 opt = TF_NOOPT;
1997 break;
1998 default:
1999 opt = 0; /* dead code to fool gcc */
2000 break;
2001 }
2002
2003 if (optval)
2004 tp->t_flags |= opt;
2005 else
2006 tp->t_flags &= ~opt;
2007 unlock_and_done:
2008 #ifdef TCP_OFFLOAD
2009 if (tp->t_flags & TF_TOE) {
2010 tcp_offload_ctloutput(tp, sopt->sopt_dir,
2011 sopt->sopt_name);
2012 }
2013 #endif
2014 INP_WUNLOCK(inp);
2015 break;
2016
2017 case TCP_NOPUSH:
2018 INP_WUNLOCK(inp);
2019 error = sooptcopyin(sopt, &optval, sizeof optval,
2020 sizeof optval);
2021 if (error)
2022 return (error);
2023
2024 INP_WLOCK_RECHECK(inp);
2025 if (optval)
2026 tp->t_flags |= TF_NOPUSH;
2027 else if (tp->t_flags & TF_NOPUSH) {
2028 tp->t_flags &= ~TF_NOPUSH;
2029 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2030 struct epoch_tracker et;
2031
2032 NET_EPOCH_ENTER(et);
2033 error = tp->t_fb->tfb_tcp_output(tp);
2034 NET_EPOCH_EXIT(et);
2035 }
2036 }
2037 goto unlock_and_done;
2038
2039 case TCP_REMOTE_UDP_ENCAPS_PORT:
2040 INP_WUNLOCK(inp);
2041 error = sooptcopyin(sopt, &optval, sizeof optval,
2042 sizeof optval);
2043 if (error)
2044 return (error);
2045 if ((optval < TCP_TUNNELING_PORT_MIN) ||
2046 (optval > TCP_TUNNELING_PORT_MAX)) {
2047 /* Its got to be in range */
2048 return (EINVAL);
2049 }
2050 if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) {
2051 /* You have to have enabled a UDP tunneling port first */
2052 return (EINVAL);
2053 }
2054 INP_WLOCK_RECHECK(inp);
2055 if (tp->t_state != TCPS_CLOSED) {
2056 /* You can't change after you are connected */
2057 error = EINVAL;
2058 } else {
2059 /* Ok we are all good set the port */
2060 tp->t_port = htons(optval);
2061 }
2062 goto unlock_and_done;
2063
2064 case TCP_MAXSEG:
2065 INP_WUNLOCK(inp);
2066 error = sooptcopyin(sopt, &optval, sizeof optval,
2067 sizeof optval);
2068 if (error)
2069 return (error);
2070
2071 INP_WLOCK_RECHECK(inp);
2072 if (optval > 0 && optval <= tp->t_maxseg &&
2073 optval + 40 >= V_tcp_minmss)
2074 tp->t_maxseg = optval;
2075 else
2076 error = EINVAL;
2077 goto unlock_and_done;
2078
2079 case TCP_INFO:
2080 INP_WUNLOCK(inp);
2081 error = EINVAL;
2082 break;
2083
2084 case TCP_STATS:
2085 INP_WUNLOCK(inp);
2086 #ifdef STATS
2087 error = sooptcopyin(sopt, &optval, sizeof optval,
2088 sizeof optval);
2089 if (error)
2090 return (error);
2091
2092 if (optval > 0)
2093 sbp = stats_blob_alloc(
2094 V_tcp_perconn_stats_dflt_tpl, 0);
2095 else
2096 sbp = NULL;
2097
2098 INP_WLOCK_RECHECK(inp);
2099 if ((tp->t_stats != NULL && sbp == NULL) ||
2100 (tp->t_stats == NULL && sbp != NULL)) {
2101 struct statsblob *t = tp->t_stats;
2102 tp->t_stats = sbp;
2103 sbp = t;
2104 }
2105 INP_WUNLOCK(inp);
2106
2107 stats_blob_destroy(sbp);
2108 #else
2109 return (EOPNOTSUPP);
2110 #endif /* !STATS */
2111 break;
2112
2113 case TCP_CONGESTION:
2114 INP_WUNLOCK(inp);
2115 error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
2116 if (error)
2117 break;
2118 buf[sopt->sopt_valsize] = '\0';
2119 INP_WLOCK_RECHECK(inp);
2120 CC_LIST_RLOCK();
2121 STAILQ_FOREACH(algo, &cc_list, entries)
2122 if (strncmp(buf, algo->name,
2123 TCP_CA_NAME_MAX) == 0)
2124 break;
2125 CC_LIST_RUNLOCK();
2126 if (algo == NULL) {
2127 INP_WUNLOCK(inp);
2128 error = EINVAL;
2129 break;
2130 }
2131 /*
2132 * We hold a write lock over the tcb so it's safe to
2133 * do these things without ordering concerns.
2134 */
2135 if (CC_ALGO(tp)->cb_destroy != NULL)
2136 CC_ALGO(tp)->cb_destroy(tp->ccv);
2137 CC_DATA(tp) = NULL;
2138 CC_ALGO(tp) = algo;
2139 /*
2140 * If something goes pear shaped initialising the new
2141 * algo, fall back to newreno (which does not
2142 * require initialisation).
2143 */
2144 if (algo->cb_init != NULL &&
2145 algo->cb_init(tp->ccv) != 0) {
2146 CC_ALGO(tp) = &newreno_cc_algo;
2147 /*
2148 * The only reason init should fail is
2149 * because of malloc.
2150 */
2151 error = ENOMEM;
2152 }
2153 INP_WUNLOCK(inp);
2154 break;
2155
2156 case TCP_REUSPORT_LB_NUMA:
2157 INP_WUNLOCK(inp);
2158 error = sooptcopyin(sopt, &optval, sizeof(optval),
2159 sizeof(optval));
2160 INP_WLOCK_RECHECK(inp);
2161 if (!error)
2162 error = in_pcblbgroup_numa(inp, optval);
2163 INP_WUNLOCK(inp);
2164 break;
2165
2166 #ifdef KERN_TLS
2167 case TCP_TXTLS_ENABLE:
2168 INP_WUNLOCK(inp);
2169 error = copyin_tls_enable(sopt, &tls);
2170 if (error)
2171 break;
2172 error = ktls_enable_tx(so, &tls);
2173 break;
2174 case TCP_TXTLS_MODE:
2175 INP_WUNLOCK(inp);
2176 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2177 if (error)
2178 return (error);
2179
2180 INP_WLOCK_RECHECK(inp);
2181 error = ktls_set_tx_mode(so, ui);
2182 INP_WUNLOCK(inp);
2183 break;
2184 case TCP_RXTLS_ENABLE:
2185 INP_WUNLOCK(inp);
2186 error = sooptcopyin(sopt, &tls, sizeof(tls),
2187 sizeof(tls));
2188 if (error)
2189 break;
2190 error = ktls_enable_rx(so, &tls);
2191 break;
2192 #endif
2193
2194 case TCP_KEEPIDLE:
2195 case TCP_KEEPINTVL:
2196 case TCP_KEEPINIT:
2197 INP_WUNLOCK(inp);
2198 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2199 if (error)
2200 return (error);
2201
2202 if (ui > (UINT_MAX / hz)) {
2203 error = EINVAL;
2204 break;
2205 }
2206 ui *= hz;
2207
2208 INP_WLOCK_RECHECK(inp);
2209 switch (sopt->sopt_name) {
2210 case TCP_KEEPIDLE:
2211 tp->t_keepidle = ui;
2212 /*
2213 * XXX: better check current remaining
2214 * timeout and "merge" it with new value.
2215 */
2216 if ((tp->t_state > TCPS_LISTEN) &&
2217 (tp->t_state <= TCPS_CLOSING))
2218 tcp_timer_activate(tp, TT_KEEP,
2219 TP_KEEPIDLE(tp));
2220 break;
2221 case TCP_KEEPINTVL:
2222 tp->t_keepintvl = ui;
2223 if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2224 (TP_MAXIDLE(tp) > 0))
2225 tcp_timer_activate(tp, TT_2MSL,
2226 TP_MAXIDLE(tp));
2227 break;
2228 case TCP_KEEPINIT:
2229 tp->t_keepinit = ui;
2230 if (tp->t_state == TCPS_SYN_RECEIVED ||
2231 tp->t_state == TCPS_SYN_SENT)
2232 tcp_timer_activate(tp, TT_KEEP,
2233 TP_KEEPINIT(tp));
2234 break;
2235 }
2236 goto unlock_and_done;
2237
2238 case TCP_KEEPCNT:
2239 INP_WUNLOCK(inp);
2240 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2241 if (error)
2242 return (error);
2243
2244 INP_WLOCK_RECHECK(inp);
2245 tp->t_keepcnt = ui;
2246 if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2247 (TP_MAXIDLE(tp) > 0))
2248 tcp_timer_activate(tp, TT_2MSL,
2249 TP_MAXIDLE(tp));
2250 goto unlock_and_done;
2251
2252 #ifdef TCPPCAP
2253 case TCP_PCAP_OUT:
2254 case TCP_PCAP_IN:
2255 INP_WUNLOCK(inp);
2256 error = sooptcopyin(sopt, &optval, sizeof optval,
2257 sizeof optval);
2258 if (error)
2259 return (error);
2260
2261 INP_WLOCK_RECHECK(inp);
2262 if (optval >= 0)
2263 tcp_pcap_set_sock_max(
2264 (sopt->sopt_name == TCP_PCAP_OUT) ?
2265 &(tp->t_outpkts) : &(tp->t_inpkts),
2266 optval);
2267 else
2268 error = EINVAL;
2269 goto unlock_and_done;
2270 #endif
2271
2272 case TCP_FASTOPEN: {
2273 struct tcp_fastopen tfo_optval;
2274
2275 INP_WUNLOCK(inp);
2276 if (!V_tcp_fastopen_client_enable &&
2277 !V_tcp_fastopen_server_enable)
2278 return (EPERM);
2279
2280 error = sooptcopyin(sopt, &tfo_optval,
2281 sizeof(tfo_optval), sizeof(int));
2282 if (error)
2283 return (error);
2284
2285 INP_WLOCK_RECHECK(inp);
2286 if ((tp->t_state != TCPS_CLOSED) &&
2287 (tp->t_state != TCPS_LISTEN)) {
2288 error = EINVAL;
2289 goto unlock_and_done;
2290 }
2291 if (tfo_optval.enable) {
2292 if (tp->t_state == TCPS_LISTEN) {
2293 if (!V_tcp_fastopen_server_enable) {
2294 error = EPERM;
2295 goto unlock_and_done;
2296 }
2297
2298 if (tp->t_tfo_pending == NULL)
2299 tp->t_tfo_pending =
2300 tcp_fastopen_alloc_counter();
2301 } else {
2302 /*
2303 * If a pre-shared key was provided,
2304 * stash it in the client cookie
2305 * field of the tcpcb for use during
2306 * connect.
2307 */
2308 if (sopt->sopt_valsize ==
2309 sizeof(tfo_optval)) {
2310 memcpy(tp->t_tfo_cookie.client,
2311 tfo_optval.psk,
2312 TCP_FASTOPEN_PSK_LEN);
2313 tp->t_tfo_client_cookie_len =
2314 TCP_FASTOPEN_PSK_LEN;
2315 }
2316 }
2317 tp->t_flags |= TF_FASTOPEN;
2318 } else
2319 tp->t_flags &= ~TF_FASTOPEN;
2320 goto unlock_and_done;
2321 }
2322
2323 #ifdef TCP_BLACKBOX
2324 case TCP_LOG:
2325 INP_WUNLOCK(inp);
2326 error = sooptcopyin(sopt, &optval, sizeof optval,
2327 sizeof optval);
2328 if (error)
2329 return (error);
2330
2331 INP_WLOCK_RECHECK(inp);
2332 error = tcp_log_state_change(tp, optval);
2333 goto unlock_and_done;
2334
2335 case TCP_LOGBUF:
2336 INP_WUNLOCK(inp);
2337 error = EINVAL;
2338 break;
2339
2340 case TCP_LOGID:
2341 INP_WUNLOCK(inp);
2342 error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
2343 if (error)
2344 break;
2345 buf[sopt->sopt_valsize] = '\0';
2346 INP_WLOCK_RECHECK(inp);
2347 error = tcp_log_set_id(tp, buf);
2348 /* tcp_log_set_id() unlocks the INP. */
2349 break;
2350
2351 case TCP_LOGDUMP:
2352 case TCP_LOGDUMPID:
2353 INP_WUNLOCK(inp);
2354 error =
2355 sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
2356 if (error)
2357 break;
2358 buf[sopt->sopt_valsize] = '\0';
2359 INP_WLOCK_RECHECK(inp);
2360 if (sopt->sopt_name == TCP_LOGDUMP) {
2361 error = tcp_log_dump_tp_logbuf(tp, buf,
2362 M_WAITOK, true);
2363 INP_WUNLOCK(inp);
2364 } else {
2365 tcp_log_dump_tp_bucket_logbufs(tp, buf);
2366 /*
2367 * tcp_log_dump_tp_bucket_logbufs() drops the
2368 * INP lock.
2369 */
2370 }
2371 break;
2372 #endif
2373
2374 default:
2375 INP_WUNLOCK(inp);
2376 error = ENOPROTOOPT;
2377 break;
2378 }
2379 break;
2380
2381 case SOPT_GET:
2382 tp = intotcpcb(inp);
2383 switch (sopt->sopt_name) {
2384 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2385 case TCP_MD5SIG:
2386 INP_WUNLOCK(inp);
2387 if (!TCPMD5_ENABLED())
2388 return (ENOPROTOOPT);
2389 error = TCPMD5_PCBCTL(inp, sopt);
2390 break;
2391 #endif
2392
2393 case TCP_NODELAY:
2394 optval = tp->t_flags & TF_NODELAY;
2395 INP_WUNLOCK(inp);
2396 error = sooptcopyout(sopt, &optval, sizeof optval);
2397 break;
2398 case TCP_MAXSEG:
2399 optval = tp->t_maxseg;
2400 INP_WUNLOCK(inp);
2401 error = sooptcopyout(sopt, &optval, sizeof optval);
2402 break;
2403 case TCP_REMOTE_UDP_ENCAPS_PORT:
2404 optval = ntohs(tp->t_port);
2405 INP_WUNLOCK(inp);
2406 error = sooptcopyout(sopt, &optval, sizeof optval);
2407 break;
2408 case TCP_NOOPT:
2409 optval = tp->t_flags & TF_NOOPT;
2410 INP_WUNLOCK(inp);
2411 error = sooptcopyout(sopt, &optval, sizeof optval);
2412 break;
2413 case TCP_NOPUSH:
2414 optval = tp->t_flags & TF_NOPUSH;
2415 INP_WUNLOCK(inp);
2416 error = sooptcopyout(sopt, &optval, sizeof optval);
2417 break;
2418 case TCP_INFO:
2419 tcp_fill_info(tp, &ti);
2420 INP_WUNLOCK(inp);
2421 error = sooptcopyout(sopt, &ti, sizeof ti);
2422 break;
2423 case TCP_STATS:
2424 {
2425 #ifdef STATS
2426 int nheld;
2427 TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
2428
2429 error = 0;
2430 socklen_t outsbsz = sopt->sopt_valsize;
2431 if (tp->t_stats == NULL)
2432 error = ENOENT;
2433 else if (outsbsz >= tp->t_stats->cursz)
2434 outsbsz = tp->t_stats->cursz;
2435 else if (outsbsz >= sizeof(struct statsblob))
2436 outsbsz = sizeof(struct statsblob);
2437 else
2438 error = EINVAL;
2439 INP_WUNLOCK(inp);
2440 if (error)
2441 break;
2442
2443 sbp = sopt->sopt_val;
2444 nheld = atop(round_page(((vm_offset_t)sbp) +
2445 (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
2446 vm_page_t ma[nheld];
2447 if (vm_fault_quick_hold_pages(
2448 &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
2449 outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
2450 nheld) < 0) {
2451 error = EFAULT;
2452 break;
2453 }
2454
2455 if ((error = copyin_nofault(&(sbp->flags), &sbflags,
2456 SIZEOF_MEMBER(struct statsblob, flags))))
2457 goto unhold;
2458
2459 INP_WLOCK_RECHECK(inp);
2460 error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
2461 sbflags | SB_CLONE_USRDSTNOFAULT);
2462 INP_WUNLOCK(inp);
2463 sopt->sopt_valsize = outsbsz;
2464 unhold:
2465 vm_page_unhold_pages(ma, nheld);
2466 #else
2467 INP_WUNLOCK(inp);
2468 error = EOPNOTSUPP;
2469 #endif /* !STATS */
2470 break;
2471 }
2472 case TCP_CONGESTION:
2473 len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
2474 INP_WUNLOCK(inp);
2475 error = sooptcopyout(sopt, buf, len + 1);
2476 break;
2477 case TCP_KEEPIDLE:
2478 case TCP_KEEPINTVL:
2479 case TCP_KEEPINIT:
2480 case TCP_KEEPCNT:
2481 switch (sopt->sopt_name) {
2482 case TCP_KEEPIDLE:
2483 ui = TP_KEEPIDLE(tp) / hz;
2484 break;
2485 case TCP_KEEPINTVL:
2486 ui = TP_KEEPINTVL(tp) / hz;
2487 break;
2488 case TCP_KEEPINIT:
2489 ui = TP_KEEPINIT(tp) / hz;
2490 break;
2491 case TCP_KEEPCNT:
2492 ui = TP_KEEPCNT(tp);
2493 break;
2494 }
2495 INP_WUNLOCK(inp);
2496 error = sooptcopyout(sopt, &ui, sizeof(ui));
2497 break;
2498 #ifdef TCPPCAP
2499 case TCP_PCAP_OUT:
2500 case TCP_PCAP_IN:
2501 optval = tcp_pcap_get_sock_max(
2502 (sopt->sopt_name == TCP_PCAP_OUT) ?
2503 &(tp->t_outpkts) : &(tp->t_inpkts));
2504 INP_WUNLOCK(inp);
2505 error = sooptcopyout(sopt, &optval, sizeof optval);
2506 break;
2507 #endif
2508 case TCP_FASTOPEN:
2509 optval = tp->t_flags & TF_FASTOPEN;
2510 INP_WUNLOCK(inp);
2511 error = sooptcopyout(sopt, &optval, sizeof optval);
2512 break;
2513 #ifdef TCP_BLACKBOX
2514 case TCP_LOG:
2515 optval = tp->t_logstate;
2516 INP_WUNLOCK(inp);
2517 error = sooptcopyout(sopt, &optval, sizeof(optval));
2518 break;
2519 case TCP_LOGBUF:
2520 /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
2521 error = tcp_log_getlogbuf(sopt, tp);
2522 break;
2523 case TCP_LOGID:
2524 len = tcp_log_get_id(tp, buf);
2525 INP_WUNLOCK(inp);
2526 error = sooptcopyout(sopt, buf, len + 1);
2527 break;
2528 case TCP_LOGDUMP:
2529 case TCP_LOGDUMPID:
2530 INP_WUNLOCK(inp);
2531 error = EINVAL;
2532 break;
2533 #endif
2534 #ifdef KERN_TLS
2535 case TCP_TXTLS_MODE:
2536 optval = ktls_get_tx_mode(so);
2537 INP_WUNLOCK(inp);
2538 error = sooptcopyout(sopt, &optval, sizeof(optval));
2539 break;
2540 case TCP_RXTLS_MODE:
2541 optval = ktls_get_rx_mode(so);
2542 INP_WUNLOCK(inp);
2543 error = sooptcopyout(sopt, &optval, sizeof(optval));
2544 break;
2545 #endif
2546 default:
2547 INP_WUNLOCK(inp);
2548 error = ENOPROTOOPT;
2549 break;
2550 }
2551 break;
2552 }
2553 return (error);
2554 }
2555 #undef INP_WLOCK_RECHECK
2556 #undef INP_WLOCK_RECHECK_CLEANUP
2557
2558 /*
2559 * Initiate (or continue) disconnect.
2560 * If embryonic state, just send reset (once).
2561 * If in ``let data drain'' option and linger null, just drop.
2562 * Otherwise (hard), mark socket disconnecting and drop
2563 * current input data; switch states based on user close, and
2564 * send segment to peer (with FIN).
2565 */
2566 static void
tcp_disconnect(struct tcpcb * tp)2567 tcp_disconnect(struct tcpcb *tp)
2568 {
2569 struct inpcb *inp = tp->t_inpcb;
2570 struct socket *so = inp->inp_socket;
2571
2572 NET_EPOCH_ASSERT();
2573 INP_WLOCK_ASSERT(inp);
2574
2575 /*
2576 * Neither tcp_close() nor tcp_drop() should return NULL, as the
2577 * socket is still open.
2578 */
2579 if (tp->t_state < TCPS_ESTABLISHED &&
2580 !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) {
2581 tp = tcp_close(tp);
2582 KASSERT(tp != NULL,
2583 ("tcp_disconnect: tcp_close() returned NULL"));
2584 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
2585 tp = tcp_drop(tp, 0);
2586 KASSERT(tp != NULL,
2587 ("tcp_disconnect: tcp_drop() returned NULL"));
2588 } else {
2589 soisdisconnecting(so);
2590 sbflush(&so->so_rcv);
2591 tcp_usrclosed(tp);
2592 if (!(inp->inp_flags & INP_DROPPED))
2593 tp->t_fb->tfb_tcp_output(tp);
2594 }
2595 }
2596
2597 /*
2598 * User issued close, and wish to trail through shutdown states:
2599 * if never received SYN, just forget it. If got a SYN from peer,
2600 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
2601 * If already got a FIN from peer, then almost done; go to LAST_ACK
2602 * state. In all other cases, have already sent FIN to peer (e.g.
2603 * after PRU_SHUTDOWN), and just have to play tedious game waiting
2604 * for peer to send FIN or not respond to keep-alives, etc.
2605 * We can let the user exit from the close as soon as the FIN is acked.
2606 */
2607 static void
tcp_usrclosed(struct tcpcb * tp)2608 tcp_usrclosed(struct tcpcb *tp)
2609 {
2610
2611 NET_EPOCH_ASSERT();
2612 INP_WLOCK_ASSERT(tp->t_inpcb);
2613
2614 switch (tp->t_state) {
2615 case TCPS_LISTEN:
2616 #ifdef TCP_OFFLOAD
2617 tcp_offload_listen_stop(tp);
2618 #endif
2619 tcp_state_change(tp, TCPS_CLOSED);
2620 /* FALLTHROUGH */
2621 case TCPS_CLOSED:
2622 tp = tcp_close(tp);
2623 /*
2624 * tcp_close() should never return NULL here as the socket is
2625 * still open.
2626 */
2627 KASSERT(tp != NULL,
2628 ("tcp_usrclosed: tcp_close() returned NULL"));
2629 break;
2630
2631 case TCPS_SYN_SENT:
2632 case TCPS_SYN_RECEIVED:
2633 tp->t_flags |= TF_NEEDFIN;
2634 break;
2635
2636 case TCPS_ESTABLISHED:
2637 tcp_state_change(tp, TCPS_FIN_WAIT_1);
2638 break;
2639
2640 case TCPS_CLOSE_WAIT:
2641 tcp_state_change(tp, TCPS_LAST_ACK);
2642 break;
2643 }
2644 if (tp->t_state >= TCPS_FIN_WAIT_2) {
2645 soisdisconnected(tp->t_inpcb->inp_socket);
2646 /* Prevent the connection hanging in FIN_WAIT_2 forever. */
2647 if (tp->t_state == TCPS_FIN_WAIT_2) {
2648 int timeout;
2649
2650 timeout = (tcp_fast_finwait2_recycle) ?
2651 tcp_finwait2_timeout : TP_MAXIDLE(tp);
2652 tcp_timer_activate(tp, TT_2MSL, timeout);
2653 }
2654 }
2655 }
2656
2657 #ifdef DDB
/*
 * Emit `indent' single-space characters to the debugger console.
 * A zero or negative count prints nothing.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
2666
2667 static void
db_print_tstate(int t_state)2668 db_print_tstate(int t_state)
2669 {
2670
2671 switch (t_state) {
2672 case TCPS_CLOSED:
2673 db_printf("TCPS_CLOSED");
2674 return;
2675
2676 case TCPS_LISTEN:
2677 db_printf("TCPS_LISTEN");
2678 return;
2679
2680 case TCPS_SYN_SENT:
2681 db_printf("TCPS_SYN_SENT");
2682 return;
2683
2684 case TCPS_SYN_RECEIVED:
2685 db_printf("TCPS_SYN_RECEIVED");
2686 return;
2687
2688 case TCPS_ESTABLISHED:
2689 db_printf("TCPS_ESTABLISHED");
2690 return;
2691
2692 case TCPS_CLOSE_WAIT:
2693 db_printf("TCPS_CLOSE_WAIT");
2694 return;
2695
2696 case TCPS_FIN_WAIT_1:
2697 db_printf("TCPS_FIN_WAIT_1");
2698 return;
2699
2700 case TCPS_CLOSING:
2701 db_printf("TCPS_CLOSING");
2702 return;
2703
2704 case TCPS_LAST_ACK:
2705 db_printf("TCPS_LAST_ACK");
2706 return;
2707
2708 case TCPS_FIN_WAIT_2:
2709 db_printf("TCPS_FIN_WAIT_2");
2710 return;
2711
2712 case TCPS_TIME_WAIT:
2713 db_printf("TCPS_TIME_WAIT");
2714 return;
2715
2716 default:
2717 db_printf("unknown");
2718 return;
2719 }
2720 }
2721
2722 static void
db_print_tflags(u_int t_flags)2723 db_print_tflags(u_int t_flags)
2724 {
2725 int comma;
2726
2727 comma = 0;
2728 if (t_flags & TF_ACKNOW) {
2729 db_printf("%sTF_ACKNOW", comma ? ", " : "");
2730 comma = 1;
2731 }
2732 if (t_flags & TF_DELACK) {
2733 db_printf("%sTF_DELACK", comma ? ", " : "");
2734 comma = 1;
2735 }
2736 if (t_flags & TF_NODELAY) {
2737 db_printf("%sTF_NODELAY", comma ? ", " : "");
2738 comma = 1;
2739 }
2740 if (t_flags & TF_NOOPT) {
2741 db_printf("%sTF_NOOPT", comma ? ", " : "");
2742 comma = 1;
2743 }
2744 if (t_flags & TF_SENTFIN) {
2745 db_printf("%sTF_SENTFIN", comma ? ", " : "");
2746 comma = 1;
2747 }
2748 if (t_flags & TF_REQ_SCALE) {
2749 db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
2750 comma = 1;
2751 }
2752 if (t_flags & TF_RCVD_SCALE) {
2753 db_printf("%sTF_RECVD_SCALE", comma ? ", " : "");
2754 comma = 1;
2755 }
2756 if (t_flags & TF_REQ_TSTMP) {
2757 db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
2758 comma = 1;
2759 }
2760 if (t_flags & TF_RCVD_TSTMP) {
2761 db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
2762 comma = 1;
2763 }
2764 if (t_flags & TF_SACK_PERMIT) {
2765 db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
2766 comma = 1;
2767 }
2768 if (t_flags & TF_NEEDSYN) {
2769 db_printf("%sTF_NEEDSYN", comma ? ", " : "");
2770 comma = 1;
2771 }
2772 if (t_flags & TF_NEEDFIN) {
2773 db_printf("%sTF_NEEDFIN", comma ? ", " : "");
2774 comma = 1;
2775 }
2776 if (t_flags & TF_NOPUSH) {
2777 db_printf("%sTF_NOPUSH", comma ? ", " : "");
2778 comma = 1;
2779 }
2780 if (t_flags & TF_MORETOCOME) {
2781 db_printf("%sTF_MORETOCOME", comma ? ", " : "");
2782 comma = 1;
2783 }
2784 if (t_flags & TF_LQ_OVERFLOW) {
2785 db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : "");
2786 comma = 1;
2787 }
2788 if (t_flags & TF_LASTIDLE) {
2789 db_printf("%sTF_LASTIDLE", comma ? ", " : "");
2790 comma = 1;
2791 }
2792 if (t_flags & TF_RXWIN0SENT) {
2793 db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
2794 comma = 1;
2795 }
2796 if (t_flags & TF_FASTRECOVERY) {
2797 db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
2798 comma = 1;
2799 }
2800 if (t_flags & TF_CONGRECOVERY) {
2801 db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
2802 comma = 1;
2803 }
2804 if (t_flags & TF_WASFRECOVERY) {
2805 db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
2806 comma = 1;
2807 }
2808 if (t_flags & TF_SIGNATURE) {
2809 db_printf("%sTF_SIGNATURE", comma ? ", " : "");
2810 comma = 1;
2811 }
2812 if (t_flags & TF_FORCEDATA) {
2813 db_printf("%sTF_FORCEDATA", comma ? ", " : "");
2814 comma = 1;
2815 }
2816 if (t_flags & TF_TSO) {
2817 db_printf("%sTF_TSO", comma ? ", " : "");
2818 comma = 1;
2819 }
2820 if (t_flags & TF_FASTOPEN) {
2821 db_printf("%sTF_FASTOPEN", comma ? ", " : "");
2822 comma = 1;
2823 }
2824 }
2825
2826 static void
db_print_tflags2(u_int t_flags2)2827 db_print_tflags2(u_int t_flags2)
2828 {
2829 int comma;
2830
2831 comma = 0;
2832 if (t_flags2 & TF2_ECN_PERMIT) {
2833 db_printf("%sTF2_ECN_PERMIT", comma ? ", " : "");
2834 comma = 1;
2835 }
2836 }
2837
2838 static void
db_print_toobflags(char t_oobflags)2839 db_print_toobflags(char t_oobflags)
2840 {
2841 int comma;
2842
2843 comma = 0;
2844 if (t_oobflags & TCPOOB_HAVEDATA) {
2845 db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
2846 comma = 1;
2847 }
2848 if (t_oobflags & TCPOOB_HADDATA) {
2849 db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
2850 comma = 1;
2851 }
2852 }
2853
2854 static void
db_print_tcpcb(struct tcpcb * tp,const char * name,int indent)2855 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
2856 {
2857
2858 db_print_indent(indent);
2859 db_printf("%s at %p\n", name, tp);
2860
2861 indent += 2;
2862
2863 db_print_indent(indent);
2864 db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
2865 TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
2866
2867 db_print_indent(indent);
2868 db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",
2869 &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
2870
2871 db_print_indent(indent);
2872 db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl,
2873 &tp->t_timers->tt_delack, tp->t_inpcb);
2874
2875 db_print_indent(indent);
2876 db_printf("t_state: %d (", tp->t_state);
2877 db_print_tstate(tp->t_state);
2878 db_printf(")\n");
2879
2880 db_print_indent(indent);
2881 db_printf("t_flags: 0x%x (", tp->t_flags);
2882 db_print_tflags(tp->t_flags);
2883 db_printf(")\n");
2884
2885 db_print_indent(indent);
2886 db_printf("t_flags2: 0x%x (", tp->t_flags2);
2887 db_print_tflags2(tp->t_flags2);
2888 db_printf(")\n");
2889
2890 db_print_indent(indent);
2891 db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n",
2892 tp->snd_una, tp->snd_max, tp->snd_nxt);
2893
2894 db_print_indent(indent);
2895 db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n",
2896 tp->snd_up, tp->snd_wl1, tp->snd_wl2);
2897
2898 db_print_indent(indent);
2899 db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n",
2900 tp->iss, tp->irs, tp->rcv_nxt);
2901
2902 db_print_indent(indent);
2903 db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n",
2904 tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
2905
2906 db_print_indent(indent);
2907 db_printf("snd_wnd: %u snd_cwnd: %u\n",
2908 tp->snd_wnd, tp->snd_cwnd);
2909
2910 db_print_indent(indent);
2911 db_printf("snd_ssthresh: %u snd_recover: "
2912 "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
2913
2914 db_print_indent(indent);
2915 db_printf("t_rcvtime: %u t_startime: %u\n",
2916 tp->t_rcvtime, tp->t_starttime);
2917
2918 db_print_indent(indent);
2919 db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
2920 tp->t_rtttime, tp->t_rtseq);
2921
2922 db_print_indent(indent);
2923 db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n",
2924 tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
2925
2926 db_print_indent(indent);
2927 db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u\n",
2928 tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin);
2929
2930 db_print_indent(indent);
2931 db_printf("t_rttupdated: %lu max_sndwnd: %u t_softerror: %d\n",
2932 tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
2933
2934 db_print_indent(indent);
2935 db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
2936 db_print_toobflags(tp->t_oobflags);
2937 db_printf(") t_iobc: 0x%02x\n", tp->t_iobc);
2938
2939 db_print_indent(indent);
2940 db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n",
2941 tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
2942
2943 db_print_indent(indent);
2944 db_printf("ts_recent: %u ts_recent_age: %u\n",
2945 tp->ts_recent, tp->ts_recent_age);
2946
2947 db_print_indent(indent);
2948 db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: "
2949 "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
2950
2951 db_print_indent(indent);
2952 db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x "
2953 "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
2954 tp->snd_recover_prev, tp->t_badrxtwin);
2955
2956 db_print_indent(indent);
2957 db_printf("snd_numholes: %d snd_holes first: %p\n",
2958 tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
2959
2960 db_print_indent(indent);
2961 db_printf("snd_fack: 0x%08x rcv_numsacks: %d\n",
2962 tp->snd_fack, tp->rcv_numsacks);
2963
2964 /* Skip sackblks, sackhint. */
2965
2966 db_print_indent(indent);
2967 db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n",
2968 tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
2969 }
2970
DB_SHOW_COMMAND(tcpcb,db_show_tcpcb)2971 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
2972 {
2973 struct tcpcb *tp;
2974
2975 if (!have_addr) {
2976 db_printf("usage: show tcpcb <addr>\n");
2977 return;
2978 }
2979 tp = (struct tcpcb *)addr;
2980
2981 db_print_tcpcb(tp, "tcpcb", 0);
2982 }
2983 #endif
2984