1 /*	$OpenBSD: uipc_socket2.c,v 1.177 2025/02/06 13:39:31 mvs Exp $	*/
2 /*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/protosw.h>
40 #include <sys/domain.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/signalvar.h>
44 #include <sys/pool.h>
45 
46 /*
47  * Primitive routines for operating on sockets and socket buffers
48  */
49 
/* Largest buffer size sbreserve() will accept for a socket buffer. */
u_long sb_max = SB_MAX;		/* [I] patchable */

/* Pools defined elsewhere in the kernel. */
extern struct pool mclpools[];
extern struct pool mbpool;
54 
55 /*
56  * Procedures to manipulate state flags of socket
57  * and do appropriate wakeups.  Normal sequence from the
58  * active (originating) side is that soisconnecting() is
59  * called during processing of connect() call,
60  * resulting in an eventual call to soisconnected() if/when the
61  * connection is established.  When the connection is torn down
62  * soisdisconnecting() is called during processing of disconnect() call,
63  * and soisdisconnected() is called when the connection to the peer
64  * is totally severed.  The semantics of these routines are such that
65  * connectionless protocols can call soisconnected() and soisdisconnected()
66  * only, bypassing the in-progress calls when setting up a ``connection''
67  * takes no time.
68  *
69  * From the passive side, a socket is created with
70  * two queues of sockets: so_q0 for connections in progress
71  * and so_q for connections already made and awaiting user acceptance.
72  * As a protocol is preparing incoming connections, it creates a socket
73  * structure queued on so_q0 by calling sonewconn().  When the connection
74  * is established, soisconnected() is called, and transfers the
75  * socket structure to so_q, making it available to accept().
76  *
77  * If a socket is closed with sockets on either
78  * so_q0 or so_q, these sockets are dropped.
79  *
80  * If higher level protocols are implemented in
81  * the kernel, the wakeups done here will sometimes
82  * cause software-interrupt process scheduling.
83  */
84 
85 void
soisconnecting(struct socket * so)86 soisconnecting(struct socket *so)
87 {
88 	soassertlocked(so);
89 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
90 	so->so_state |= SS_ISCONNECTING;
91 }
92 
93 void
soisconnected(struct socket * so)94 soisconnected(struct socket *so)
95 {
96 	struct socket *head = so->so_head;
97 
98 	soassertlocked(so);
99 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
100 	so->so_state |= SS_ISCONNECTED;
101 
102 	if (head != NULL && so->so_onq == &head->so_q0) {
103 		soref(head);
104 		sounlock(so);
105 		solock(head);
106 		solock(so);
107 
108 		if (so->so_onq != &head->so_q0) {
109 			sounlock(head);
110 			sorele(head);
111 			return;
112 		}
113 
114 		soqremque(so, 0);
115 		soqinsque(head, so, 1);
116 		sorwakeup(head);
117 		wakeup_one(&head->so_timeo);
118 
119 		sounlock(head);
120 		sorele(head);
121 	} else {
122 		wakeup(&so->so_timeo);
123 		sorwakeup(so);
124 		sowwakeup(so);
125 	}
126 }
127 
128 void
soisdisconnecting(struct socket * so)129 soisdisconnecting(struct socket *so)
130 {
131 	soassertlocked(so);
132 	so->so_state &= ~SS_ISCONNECTING;
133 	so->so_state |= SS_ISDISCONNECTING;
134 
135 	mtx_enter(&so->so_rcv.sb_mtx);
136 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
137 	mtx_leave(&so->so_rcv.sb_mtx);
138 
139 	mtx_enter(&so->so_snd.sb_mtx);
140 	so->so_snd.sb_state |= SS_CANTSENDMORE;
141 	mtx_leave(&so->so_snd.sb_mtx);
142 
143 	wakeup(&so->so_timeo);
144 	sowwakeup(so);
145 	sorwakeup(so);
146 }
147 
148 void
soisdisconnected(struct socket * so)149 soisdisconnected(struct socket *so)
150 {
151 	soassertlocked(so);
152 
153 	mtx_enter(&so->so_rcv.sb_mtx);
154 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
155 	mtx_leave(&so->so_rcv.sb_mtx);
156 
157 	mtx_enter(&so->so_snd.sb_mtx);
158 	so->so_snd.sb_state |= SS_CANTSENDMORE;
159 	mtx_leave(&so->so_snd.sb_mtx);
160 
161 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
162 	so->so_state |= SS_ISDISCONNECTED;
163 
164 	wakeup(&so->so_timeo);
165 	sowwakeup(so);
166 	sorwakeup(so);
167 }
168 
169 /*
170  * When an attempt at a new connection is noted on a socket
171  * which accepts connections, sonewconn is called.  If the
172  * connection is possible (subject to space constraints, etc.)
173  * then we allocate a new structure, properly linked into the
174  * data structure of the original socket, and return this.
175  * Connstatus may be 0 or SS_ISCONNECTED.
176  */
177 struct socket *
sonewconn(struct socket * head,int connstatus,int wait)178 sonewconn(struct socket *head, int connstatus, int wait)
179 {
180 	struct socket *so;
181 	int soqueue = connstatus ? 1 : 0;
182 
183 	soassertlocked(head);
184 
185 	if (m_pool_used() > 95)
186 		return (NULL);
187 	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
188 		return (NULL);
189 	so = soalloc(head->so_proto, wait);
190 	if (so == NULL)
191 		return (NULL);
192 	so->so_type = head->so_type;
193 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
194 	so->so_linger = head->so_linger;
195 	so->so_state = head->so_state | SS_NOFDREF;
196 	so->so_proto = head->so_proto;
197 	so->so_timeo = head->so_timeo;
198 	so->so_euid = head->so_euid;
199 	so->so_ruid = head->so_ruid;
200 	so->so_egid = head->so_egid;
201 	so->so_rgid = head->so_rgid;
202 	so->so_cpid = head->so_cpid;
203 
204 	/*
205 	 * Lock order will be `head' -> `so' while these sockets are linked.
206 	 */
207 	solock_nonet(so);
208 
209 	/*
210 	 * Inherit watermarks but those may get clamped in low mem situations.
211 	 */
212 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
213 		goto fail;
214 
215 	mtx_enter(&head->so_snd.sb_mtx);
216 	so->so_snd.sb_wat = head->so_snd.sb_wat;
217 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
218 	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
219 	mtx_leave(&head->so_snd.sb_mtx);
220 
221 	mtx_enter(&head->so_rcv.sb_mtx);
222 	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
223 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
224 	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
225 	mtx_leave(&head->so_rcv.sb_mtx);
226 
227 	sigio_copy(&so->so_sigio, &head->so_sigio);
228 
229 	soqinsque(head, so, soqueue);
230 	if (pru_attach(so, 0, wait) != 0) {
231 		soqremque(so, soqueue);
232 		goto fail;
233 	}
234 	if (connstatus) {
235 		so->so_state |= connstatus;
236 		sorwakeup(head);
237 		wakeup(&head->so_timeo);
238 	}
239 
240 	return (so);
241 
242 fail:
243 	sounlock_nonet(so);
244 	sigio_free(&so->so_sigio);
245 	klist_free(&so->so_rcv.sb_klist);
246 	klist_free(&so->so_snd.sb_klist);
247 	pool_put(&socket_pool, so);
248 
249 	return (NULL);
250 }
251 
252 void
soqinsque(struct socket * head,struct socket * so,int q)253 soqinsque(struct socket *head, struct socket *so, int q)
254 {
255 	soassertlocked(head);
256 	soassertlocked(so);
257 
258 	KASSERT(so->so_onq == NULL);
259 
260 	so->so_head = head;
261 	if (q == 0) {
262 		head->so_q0len++;
263 		so->so_onq = &head->so_q0;
264 	} else {
265 		head->so_qlen++;
266 		so->so_onq = &head->so_q;
267 	}
268 	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
269 }
270 
271 int
soqremque(struct socket * so,int q)272 soqremque(struct socket *so, int q)
273 {
274 	struct socket *head = so->so_head;
275 
276 	soassertlocked(so);
277 	soassertlocked(head);
278 
279 	if (q == 0) {
280 		if (so->so_onq != &head->so_q0)
281 			return (0);
282 		head->so_q0len--;
283 	} else {
284 		if (so->so_onq != &head->so_q)
285 			return (0);
286 		head->so_qlen--;
287 	}
288 	TAILQ_REMOVE(so->so_onq, so, so_qe);
289 	so->so_onq = NULL;
290 	so->so_head = NULL;
291 	return (1);
292 }
293 
294 /*
295  * Socantsendmore indicates that no more data will be sent on the
296  * socket; it would normally be applied to a socket when the user
297  * informs the system that no more data is to be sent, by the protocol
298  * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
299  * will be received, and will normally be applied to the socket by a
300  * protocol when it detects that the peer will send no more data.
301  * Data queued for reading in the socket may yet be read.
302  */
303 
304 void
socantsendmore(struct socket * so)305 socantsendmore(struct socket *so)
306 {
307 	soassertlocked(so);
308 	mtx_enter(&so->so_snd.sb_mtx);
309 	so->so_snd.sb_state |= SS_CANTSENDMORE;
310 	mtx_leave(&so->so_snd.sb_mtx);
311 	sowwakeup(so);
312 }
313 
314 void
socantrcvmore(struct socket * so)315 socantrcvmore(struct socket *so)
316 {
317 	mtx_enter(&so->so_rcv.sb_mtx);
318 	so->so_rcv.sb_state |= SS_CANTRCVMORE;
319 	mtx_leave(&so->so_rcv.sb_mtx);
320 	sorwakeup(so);
321 }
322 
323 void
solock(struct socket * so)324 solock(struct socket *so)
325 {
326 	switch (so->so_proto->pr_domain->dom_family) {
327 	case PF_INET:
328 	case PF_INET6:
329 		NET_LOCK();
330 		break;
331 	default:
332 		rw_enter_write(&so->so_lock);
333 		break;
334 	}
335 }
336 
337 void
solock_shared(struct socket * so)338 solock_shared(struct socket *so)
339 {
340 	switch (so->so_proto->pr_domain->dom_family) {
341 	case PF_INET:
342 	case PF_INET6:
343 		NET_LOCK_SHARED();
344 		break;
345 	}
346 	rw_enter_write(&so->so_lock);
347 }
348 
349 void
solock_nonet(struct socket * so)350 solock_nonet(struct socket *so)
351 {
352 	switch (so->so_proto->pr_domain->dom_family) {
353 	case PF_INET:
354 	case PF_INET6:
355 		NET_ASSERT_LOCKED();
356 		break;
357 	}
358 	rw_enter_write(&so->so_lock);
359 }
360 
361 int
solock_persocket(struct socket * so)362 solock_persocket(struct socket *so)
363 {
364 	switch (so->so_proto->pr_domain->dom_family) {
365 	case PF_INET:
366 	case PF_INET6:
367 		return 0;
368 	default:
369 		return 1;
370 	}
371 }
372 
373 void
solock_pair(struct socket * so1,struct socket * so2)374 solock_pair(struct socket *so1, struct socket *so2)
375 {
376 	KASSERT(so1->so_type == so2->so_type);
377 
378 	switch (so1->so_proto->pr_domain->dom_family) {
379 	case PF_INET:
380 	case PF_INET6:
381 		NET_LOCK_SHARED();
382 		break;
383 	}
384 	if (so1 == so2) {
385 		rw_enter_write(&so1->so_lock);
386 	} else if (so1 < so2) {
387 		rw_enter_write(&so1->so_lock);
388 		rw_enter_write(&so2->so_lock);
389 	} else {
390 		rw_enter_write(&so2->so_lock);
391 		rw_enter_write(&so1->so_lock);
392 	}
393 }
394 
395 void
sounlock(struct socket * so)396 sounlock(struct socket *so)
397 {
398 	switch (so->so_proto->pr_domain->dom_family) {
399 	case PF_INET:
400 	case PF_INET6:
401 		NET_UNLOCK();
402 		break;
403 	default:
404 		rw_exit_write(&so->so_lock);
405 		break;
406 	}
407 }
408 
409 void
sounlock_shared(struct socket * so)410 sounlock_shared(struct socket *so)
411 {
412 	rw_exit_write(&so->so_lock);
413 	switch (so->so_proto->pr_domain->dom_family) {
414 	case PF_INET:
415 	case PF_INET6:
416 		NET_UNLOCK_SHARED();
417 		break;
418 	}
419 }
420 
/*
 * Undo solock_nonet(): release only the socket's own so_lock.  The net
 * lock is not touched.
 */
void
sounlock_nonet(struct socket *so)
{
	rw_exit_write(&so->so_lock);
}
426 
427 void
sounlock_pair(struct socket * so1,struct socket * so2)428 sounlock_pair(struct socket *so1, struct socket *so2)
429 {
430 	if (so1 == so2)
431 		rw_exit_write(&so1->so_lock);
432 	else if (so1 < so2) {
433 		rw_exit_write(&so2->so_lock);
434 		rw_exit_write(&so1->so_lock);
435 	} else {
436 		rw_exit_write(&so1->so_lock);
437 		rw_exit_write(&so2->so_lock);
438 	}
439 	switch (so1->so_proto->pr_domain->dom_family) {
440 	case PF_INET:
441 	case PF_INET6:
442 		NET_UNLOCK_SHARED();
443 		break;
444 	}
445 }
446 
447 void
soassertlocked_readonly(struct socket * so)448 soassertlocked_readonly(struct socket *so)
449 {
450 	switch (so->so_proto->pr_domain->dom_family) {
451 	case PF_INET:
452 	case PF_INET6:
453 		NET_ASSERT_LOCKED();
454 		break;
455 	default:
456 		rw_assert_wrlock(&so->so_lock);
457 		break;
458 	}
459 }
460 
461 void
soassertlocked(struct socket * so)462 soassertlocked(struct socket *so)
463 {
464 	switch (so->so_proto->pr_domain->dom_family) {
465 	case PF_INET:
466 	case PF_INET6:
467 		if (rw_status(&netlock) == RW_READ) {
468 			NET_ASSERT_LOCKED();
469 
470 			if (splassert_ctl > 0 &&
471 			    rw_status(&so->so_lock) != RW_WRITE)
472 				splassert_fail(0, RW_WRITE, __func__);
473 		} else
474 			NET_ASSERT_LOCKED_EXCLUSIVE();
475 		break;
476 	default:
477 		rw_assert_wrlock(&so->so_lock);
478 		break;
479 	}
480 }
481 
482 int
sosleep_nsec(struct socket * so,void * ident,int prio,const char * wmesg,uint64_t nsecs)483 sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
484     uint64_t nsecs)
485 {
486 	int ret;
487 
488 	switch (so->so_proto->pr_domain->dom_family) {
489 	case PF_INET:
490 	case PF_INET6:
491 		if (rw_status(&netlock) == RW_READ)
492 			rw_exit_write(&so->so_lock);
493 		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
494 		if (rw_status(&netlock) == RW_READ)
495 			rw_enter_write(&so->so_lock);
496 		break;
497 	default:
498 		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
499 		break;
500 	}
501 
502 	return ret;
503 }
504 
505 void
sbmtxassertlocked(struct sockbuf * sb)506 sbmtxassertlocked(struct sockbuf *sb)
507 {
508 	if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
509 		splassert_fail(0, RW_WRITE, __func__);
510 }
511 
512 /*
513  * Wait for data to arrive at/drain from a socket buffer.
514  */
515 int
sbwait(struct sockbuf * sb)516 sbwait(struct sockbuf *sb)
517 {
518 	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
519 
520 	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
521 
522 	sb->sb_flags |= SB_WAIT;
523 	return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
524 	    sb->sb_timeo_nsecs);
525 }
526 
527 int
sblock(struct sockbuf * sb,int flags)528 sblock(struct sockbuf *sb, int flags)
529 {
530 	int rwflags = RW_WRITE, error;
531 
532 	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
533 		rwflags |= RW_INTR;
534 	if (!(flags & SBL_WAIT))
535 		rwflags |= RW_NOSLEEP;
536 
537 	error = rw_enter(&sb->sb_lock, rwflags);
538 	if (error == EBUSY)
539 		error = EWOULDBLOCK;
540 
541 	return error;
542 }
543 
/*
 * Release the sb_lock of a socket buffer taken with sblock().
 */
void
sbunlock(struct sockbuf *sb)
{
	rw_exit(&sb->sb_lock);
}
549 
550 /*
551  * Wakeup processes waiting on a socket buffer.
552  * Do asynchronous notification via SIGIO
553  * if the socket buffer has the SB_ASYNC flag set.
554  */
555 void
sowakeup(struct socket * so,struct sockbuf * sb)556 sowakeup(struct socket *so, struct sockbuf *sb)
557 {
558 	int dowakeup = 0, dopgsigio = 0;
559 
560 	mtx_enter(&sb->sb_mtx);
561 	if (sb->sb_flags & SB_WAIT) {
562 		sb->sb_flags &= ~SB_WAIT;
563 		dowakeup = 1;
564 	}
565 	if (sb->sb_flags & SB_ASYNC)
566 		dopgsigio = 1;
567 
568 	knote_locked(&sb->sb_klist, 0);
569 	mtx_leave(&sb->sb_mtx);
570 
571 	if (dowakeup)
572 		wakeup(&sb->sb_cc);
573 
574 	if (dopgsigio)
575 		pgsigio(&so->so_sigio, SIGIO, 0);
576 }
577 
578 /*
579  * Socket buffer (struct sockbuf) utility routines.
580  *
581  * Each socket contains two socket buffers: one for sending data and
582  * one for receiving data.  Each buffer contains a queue of mbufs,
583  * information about the number of mbufs and amount of data in the
584  * queue, and other fields allowing select() statements and notification
585  * on data availability to be implemented.
586  *
587  * Data stored in a socket buffer is maintained as a list of records.
588  * Each record is a list of mbufs chained together with the m_next
589  * field.  Records are chained together with the m_nextpkt field. The upper
590  * level routine soreceive() expects the following conventions to be
591  * observed when placing information in the receive buffer:
592  *
593  * 1. If the protocol requires each message be preceded by the sender's
594  *    name, then a record containing that name must be present before
595  *    any associated data (mbuf's must be of type MT_SONAME).
596  * 2. If the protocol supports the exchange of ``access rights'' (really
597  *    just additional data associated with the message), and there are
598  *    ``rights'' to be received, then a record containing this data
599  *    should be present (mbuf's must be of type MT_CONTROL).
600  * 3. If a name or rights record exists, then it must be followed by
601  *    a data record, perhaps of zero length.
602  *
603  * Before using a new socket structure it is first necessary to reserve
604  * buffer space to the socket, by calling sbreserve().  This should commit
605  * some of the available buffer space in the system buffer pool for the
606  * socket (currently, it does nothing but enforce limits).  The space
607  * should be released by calling sbrelease() when the socket is destroyed.
608  */
609 
610 int
soreserve(struct socket * so,u_long sndcc,u_long rcvcc)611 soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
612 {
613 	soassertlocked(so);
614 
615 	mtx_enter(&so->so_rcv.sb_mtx);
616 	mtx_enter(&so->so_snd.sb_mtx);
617 	if (sbreserve(so, &so->so_snd, sndcc))
618 		goto bad;
619 	so->so_snd.sb_wat = sndcc;
620 	if (so->so_snd.sb_lowat == 0)
621 		so->so_snd.sb_lowat = MCLBYTES;
622 	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
623 		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
624 	if (sbreserve(so, &so->so_rcv, rcvcc))
625 		goto bad2;
626 	so->so_rcv.sb_wat = rcvcc;
627 	if (so->so_rcv.sb_lowat == 0)
628 		so->so_rcv.sb_lowat = 1;
629 	mtx_leave(&so->so_snd.sb_mtx);
630 	mtx_leave(&so->so_rcv.sb_mtx);
631 
632 	return (0);
633 bad2:
634 	sbrelease(so, &so->so_snd);
635 bad:
636 	mtx_leave(&so->so_snd.sb_mtx);
637 	mtx_leave(&so->so_rcv.sb_mtx);
638 	return (ENOBUFS);
639 }
640 
641 /*
642  * Allot mbufs to a sockbuf.
643  * Attempt to scale mbmax so that mbcnt doesn't become limiting
644  * if buffering efficiency is near the normal case.
645  */
646 int
sbreserve(struct socket * so,struct sockbuf * sb,u_long cc)647 sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
648 {
649 	sbmtxassertlocked(sb);
650 
651 	if (cc == 0 || cc > sb_max)
652 		return (1);
653 	sb->sb_hiwat = cc;
654 	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
655 	if (sb->sb_lowat > sb->sb_hiwat)
656 		sb->sb_lowat = sb->sb_hiwat;
657 	return (0);
658 }
659 
660 /*
661  * In low memory situation, do not accept any greater than normal request.
662  */
663 int
sbcheckreserve(u_long cnt,u_long defcnt)664 sbcheckreserve(u_long cnt, u_long defcnt)
665 {
666 	if (cnt > defcnt && sbchecklowmem())
667 		return (ENOBUFS);
668 	return (0);
669 }
670 
/*
 * Report whether the mbuf pools are running low.
 *
 * The answer has hysteresis: the low memory flag is raised once
 * m_pool_used() exceeds 80, lowered once it drops below 60, and keeps
 * its previous value in between.  Returns the current flag (0 or 1).
 */
int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used;

	/*
	 * m_pool_used() is thread safe.  Several CPUs may update sblowmem
	 * concurrently, but they nearly always store the same value, and
	 * a briefly stale value is harmless.
	 */
	used = m_pool_used();
	if (used > 80)
		atomic_store_int(&sblowmem, 1);
	else if (used < 60)
		atomic_store_int(&sblowmem, 0);

	return (atomic_load_int(&sblowmem));
}
690 
691 /*
692  * Free mbufs held by a socket, and reserved mbuf space.
693  */
694 void
sbrelease(struct socket * so,struct sockbuf * sb)695 sbrelease(struct socket *so, struct sockbuf *sb)
696 {
697 
698 	sbflush(sb);
699 	sb->sb_hiwat = sb->sb_mbmax = 0;
700 }
701 
702 /*
703  * Routines to add and remove
704  * data from an mbuf queue.
705  *
706  * The routines sbappend() or sbappendrecord() are normally called to
707  * append new mbufs to a socket buffer, after checking that adequate
708  * space is available, comparing the function sbspace() with the amount
709  * of data to be added.  sbappendrecord() differs from sbappend() in
710  * that data supplied is treated as the beginning of a new record.
711  * To place a sender's address, optional access rights, and data in a
712  * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
714  * should be used.  In either case, the new data begins a new record.
715  * Note that unlike sbappend() and sbappendrecord(), these routines check
716  * for the caller that there will be enough space to store the data.
717  * Each fails if there is not enough space, or if it cannot find mbufs
718  * to store additional information in.
719  *
720  * Reliable protocols may use the socket send buffer to hold data
721  * awaiting acknowledgement.  Data is normally copied from a socket
722  * send buffer in a protocol with m_copym for output to a peer,
 * and then removed from the socket buffer with sbdrop()
724  * or sbdroprecord() when the data is acknowledged by the peer.
725  */
726 
727 #ifdef SOCKBUF_DEBUG
728 void
sblastrecordchk(struct sockbuf * sb,const char * where)729 sblastrecordchk(struct sockbuf *sb, const char *where)
730 {
731 	struct mbuf *m = sb->sb_mb;
732 
733 	while (m && m->m_nextpkt)
734 		m = m->m_nextpkt;
735 
736 	if (m != sb->sb_lastrecord) {
737 		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
738 		    sb->sb_mb, sb->sb_lastrecord, m);
739 		printf("packet chain:\n");
740 		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
741 			printf("\t%p\n", m);
742 		panic("sblastrecordchk from %s", where);
743 	}
744 }
745 
746 void
sblastmbufchk(struct sockbuf * sb,const char * where)747 sblastmbufchk(struct sockbuf *sb, const char *where)
748 {
749 	struct mbuf *m = sb->sb_mb;
750 	struct mbuf *n;
751 
752 	while (m && m->m_nextpkt)
753 		m = m->m_nextpkt;
754 
755 	while (m && m->m_next)
756 		m = m->m_next;
757 
758 	if (m != sb->sb_mbtail) {
759 		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
760 		    sb->sb_mb, sb->sb_mbtail, m);
761 		printf("packet tree:\n");
762 		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
763 			printf("\t");
764 			for (n = m; n != NULL; n = n->m_next)
765 				printf("%p ", n);
766 			printf("\n");
767 		}
768 		panic("sblastmbufchk from %s", where);
769 	}
770 }
771 #endif /* SOCKBUF_DEBUG */
772 
/*
 * Link the record that starts at mbuf m0 onto the end of sb's record
 * list: chain it behind the current last record through m_nextpkt, or
 * make it the head (sb_mb) when there is no last record, and then make
 * it the new sb_lastrecord.  Both arguments are evaluated more than
 * once.
 */
#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)
781 
782 /*
783  * Append mbuf chain m to the last record in the
784  * socket buffer sb.  The additional space associated
785  * the mbuf chain is recorded in sb.  Empty mbufs are
786  * discarded and mbufs are compacted where possible.
787  */
788 void
sbappend(struct socket * so,struct sockbuf * sb,struct mbuf * m)789 sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
790 {
791 	struct mbuf *n;
792 
793 	if (m == NULL)
794 		return;
795 
796 	sbmtxassertlocked(sb);
797 	SBLASTRECORDCHK(sb, "sbappend 1");
798 
799 	if ((n = sb->sb_lastrecord) != NULL) {
800 		/*
801 		 * XXX Would like to simply use sb_mbtail here, but
802 		 * XXX I need to verify that I won't miss an EOR that
803 		 * XXX way.
804 		 */
805 		do {
806 			if (n->m_flags & M_EOR) {
807 				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
808 				return;
809 			}
810 		} while (n->m_next && (n = n->m_next));
811 	} else {
812 		/*
813 		 * If this is the first record in the socket buffer, it's
814 		 * also the last record.
815 		 */
816 		sb->sb_lastrecord = m;
817 	}
818 	sbcompress(sb, m, n);
819 	SBLASTRECORDCHK(sb, "sbappend 2");
820 }
821 
822 /*
823  * This version of sbappend() should only be used when the caller
824  * absolutely knows that there will never be more than one record
825  * in the socket buffer, that is, a stream protocol (such as TCP).
826  */
827 void
sbappendstream(struct socket * so,struct sockbuf * sb,struct mbuf * m)828 sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
829 {
830 	sbmtxassertlocked(sb);
831 	KDASSERT(m->m_nextpkt == NULL);
832 	KASSERT(sb->sb_mb == sb->sb_lastrecord);
833 
834 	SBLASTMBUFCHK(sb, __func__);
835 
836 	sbcompress(sb, m, sb->sb_mbtail);
837 
838 	sb->sb_lastrecord = sb->sb_mb;
839 	SBLASTRECORDCHK(sb, __func__);
840 }
841 
842 #ifdef SOCKBUF_DEBUG
843 void
sbcheck(struct socket * so,struct sockbuf * sb)844 sbcheck(struct socket *so, struct sockbuf *sb)
845 {
846 	struct mbuf *m, *n;
847 	u_long len = 0, mbcnt = 0;
848 
849 	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
850 		for (n = m; n; n = n->m_next) {
851 			len += n->m_len;
852 			mbcnt += MSIZE;
853 			if (n->m_flags & M_EXT)
854 				mbcnt += n->m_ext.ext_size;
855 			if (m != n && n->m_nextpkt)
856 				panic("sbcheck nextpkt");
857 		}
858 	}
859 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
860 		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
861 		    mbcnt, sb->sb_mbcnt);
862 		panic("sbcheck");
863 	}
864 }
865 #endif
866 
867 /*
868  * As above, except the mbuf chain
869  * begins a new record.
870  */
871 void
sbappendrecord(struct socket * so,struct sockbuf * sb,struct mbuf * m0)872 sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
873 {
874 	struct mbuf *m;
875 
876 	sbmtxassertlocked(sb);
877 
878 	if (m0 == NULL)
879 		return;
880 
881 	/*
882 	 * Put the first mbuf on the queue.
883 	 * Note this permits zero length records.
884 	 */
885 	sballoc(sb, m0);
886 	SBLASTRECORDCHK(sb, "sbappendrecord 1");
887 	SBLINKRECORD(sb, m0);
888 	m = m0->m_next;
889 	m0->m_next = NULL;
890 	if (m && (m0->m_flags & M_EOR)) {
891 		m0->m_flags &= ~M_EOR;
892 		m->m_flags |= M_EOR;
893 	}
894 	sbcompress(sb, m, m0);
895 	SBLASTRECORDCHK(sb, "sbappendrecord 2");
896 }
897 
898 /*
899  * Append address and data, and optionally, control (ancillary) data
900  * to the receive queue of a socket.  If present,
901  * m0 must include a packet header with total length.
902  * Returns 0 if no space in sockbuf or insufficient mbufs.
903  */
904 int
sbappendaddr(struct socket * so,struct sockbuf * sb,const struct sockaddr * asa,struct mbuf * m0,struct mbuf * control)905 sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
906     struct mbuf *m0, struct mbuf *control)
907 {
908 	struct mbuf *m, *n, *nlast;
909 	int space = asa->sa_len;
910 
911 	sbmtxassertlocked(sb);
912 
913 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
914 		panic("sbappendaddr");
915 	if (m0)
916 		space += m0->m_pkthdr.len;
917 	for (n = control; n; n = n->m_next) {
918 		space += n->m_len;
919 		if (n->m_next == NULL)	/* keep pointer to last control buf */
920 			break;
921 	}
922 	if (space > sbspace_locked(sb))
923 		return (0);
924 	if (asa->sa_len > MLEN)
925 		return (0);
926 	MGET(m, M_DONTWAIT, MT_SONAME);
927 	if (m == NULL)
928 		return (0);
929 	m->m_len = asa->sa_len;
930 	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
931 	if (n)
932 		n->m_next = m0;		/* concatenate data to control */
933 	else
934 		control = m0;
935 	m->m_next = control;
936 
937 	SBLASTRECORDCHK(sb, "sbappendaddr 1");
938 
939 	for (n = m; n->m_next != NULL; n = n->m_next)
940 		sballoc(sb, n);
941 	sballoc(sb, n);
942 	nlast = n;
943 	SBLINKRECORD(sb, m);
944 
945 	sb->sb_mbtail = nlast;
946 	SBLASTMBUFCHK(sb, "sbappendaddr");
947 
948 	SBLASTRECORDCHK(sb, "sbappendaddr 2");
949 
950 	return (1);
951 }
952 
953 int
sbappendcontrol(struct socket * so,struct sockbuf * sb,struct mbuf * m0,struct mbuf * control)954 sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
955     struct mbuf *control)
956 {
957 	struct mbuf *m, *mlast, *n;
958 	int eor = 0, space = 0;
959 
960 	sbmtxassertlocked(sb);
961 
962 	if (control == NULL)
963 		panic("sbappendcontrol");
964 	for (m = control; ; m = m->m_next) {
965 		space += m->m_len;
966 		if (m->m_next == NULL)
967 			break;
968 	}
969 	n = m;			/* save pointer to last control buffer */
970 	for (m = m0; m; m = m->m_next) {
971 		space += m->m_len;
972 		eor |= m->m_flags & M_EOR;
973 		if (eor) {
974 			if (m->m_next == NULL)
975 				m->m_flags |= M_EOR;
976 			else
977 				m->m_flags &= ~M_EOR;
978 		}
979 	}
980 	if (space > sbspace_locked(sb))
981 		return (0);
982 	n->m_next = m0;			/* concatenate data to control */
983 
984 	SBLASTRECORDCHK(sb, "sbappendcontrol 1");
985 
986 	for (m = control; m->m_next != NULL; m = m->m_next)
987 		sballoc(sb, m);
988 	sballoc(sb, m);
989 	mlast = m;
990 	SBLINKRECORD(sb, control);
991 
992 	sb->sb_mbtail = mlast;
993 	SBLASTMBUFCHK(sb, "sbappendcontrol");
994 
995 	SBLASTRECORDCHK(sb, "sbappendcontrol 2");
996 
997 	return (1);
998 }
999 
1000 /*
1001  * Compress mbuf chain m into the socket
1002  * buffer sb following mbuf n.  If n
1003  * is null, the buffer is presumed empty.
1004  */
1005 void
sbcompress(struct sockbuf * sb,struct mbuf * m,struct mbuf * n)1006 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1007 {
1008 	int eor = 0;
1009 	struct mbuf *o;
1010 
1011 	while (m) {
1012 		eor |= m->m_flags & M_EOR;
1013 		if (m->m_len == 0 &&
1014 		    (eor == 0 ||
1015 		    (((o = m->m_next) || (o = n)) &&
1016 		    o->m_type == m->m_type))) {
1017 			if (sb->sb_lastrecord == m)
1018 				sb->sb_lastrecord = m->m_next;
1019 			m = m_free(m);
1020 			continue;
1021 		}
1022 		if (n && (n->m_flags & M_EOR) == 0 &&
1023 		    /* m_trailingspace() checks buffer writeability */
1024 		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
1025 		       MCLBYTES) / 4 && /* XXX Don't copy too much */
1026 		    m->m_len <= m_trailingspace(n) &&
1027 		    n->m_type == m->m_type) {
1028 			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
1029 			    m->m_len);
1030 			n->m_len += m->m_len;
1031 			sb->sb_cc += m->m_len;
1032 			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
1033 				sb->sb_datacc += m->m_len;
1034 			m = m_free(m);
1035 			continue;
1036 		}
1037 		if (n)
1038 			n->m_next = m;
1039 		else
1040 			sb->sb_mb = m;
1041 		sb->sb_mbtail = m;
1042 		sballoc(sb, m);
1043 		n = m;
1044 		m->m_flags &= ~M_EOR;
1045 		m = m->m_next;
1046 		n->m_next = NULL;
1047 	}
1048 	if (eor) {
1049 		if (n)
1050 			n->m_flags |= eor;
1051 		else
1052 			printf("semi-panic: sbcompress");
1053 	}
1054 	SBLASTMBUFCHK(sb, __func__);
1055 }
1056 
1057 /*
1058  * Free all mbufs in a sockbuf.
1059  * Check that all resources are reclaimed.
1060  */
1061 void
sbflush(struct sockbuf * sb)1062 sbflush(struct sockbuf *sb)
1063 {
1064 	rw_assert_unlocked(&sb->sb_lock);
1065 
1066 	while (sb->sb_mbcnt)
1067 		sbdrop(sb, (int)sb->sb_cc);
1068 
1069 	KASSERT(sb->sb_cc == 0);
1070 	KASSERT(sb->sb_datacc == 0);
1071 	KASSERT(sb->sb_mb == NULL);
1072 	KASSERT(sb->sb_mbtail == NULL);
1073 	KASSERT(sb->sb_lastrecord == NULL);
1074 }
1075 
1076 /*
1077  * Drop data from (the front of) a sockbuf.
1078  */
1079 void
sbdrop(struct sockbuf * sb,int len)1080 sbdrop(struct sockbuf *sb, int len)
1081 {
1082 	struct mbuf *m, *mn;
1083 	struct mbuf *next;
1084 
1085 	sbmtxassertlocked(sb);
1086 
1087 	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
1088 	while (len > 0) {
1089 		if (m == NULL) {
1090 			if (next == NULL)
1091 				panic("sbdrop");
1092 			m = next;
1093 			next = m->m_nextpkt;
1094 			continue;
1095 		}
1096 		if (m->m_len > len) {
1097 			m->m_len -= len;
1098 			m->m_data += len;
1099 			sb->sb_cc -= len;
1100 			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
1101 				sb->sb_datacc -= len;
1102 			break;
1103 		}
1104 		len -= m->m_len;
1105 		sbfree(sb, m);
1106 		mn = m_free(m);
1107 		m = mn;
1108 	}
1109 	while (m && m->m_len == 0) {
1110 		sbfree(sb, m);
1111 		mn = m_free(m);
1112 		m = mn;
1113 	}
1114 	if (m) {
1115 		sb->sb_mb = m;
1116 		m->m_nextpkt = next;
1117 	} else
1118 		sb->sb_mb = next;
1119 	/*
1120 	 * First part is an inline SB_EMPTY_FIXUP().  Second part
1121 	 * makes sure sb_lastrecord is up-to-date if we dropped
1122 	 * part of the last record.
1123 	 */
1124 	m = sb->sb_mb;
1125 	if (m == NULL) {
1126 		sb->sb_mbtail = NULL;
1127 		sb->sb_lastrecord = NULL;
1128 	} else if (m->m_nextpkt == NULL)
1129 		sb->sb_lastrecord = m;
1130 }
1131 
1132 /*
1133  * Drop a record off the front of a sockbuf
1134  * and move the next record to the front.
1135  */
1136 void
sbdroprecord(struct sockbuf * sb)1137 sbdroprecord(struct sockbuf *sb)
1138 {
1139 	struct mbuf *m, *mn;
1140 
1141 	m = sb->sb_mb;
1142 	if (m) {
1143 		sb->sb_mb = m->m_nextpkt;
1144 		do {
1145 			sbfree(sb, m);
1146 			mn = m_free(m);
1147 		} while ((m = mn) != NULL);
1148 	}
1149 	SB_EMPTY_FIXUP(sb);
1150 }
1151 
1152 /*
1153  * Create a "control" mbuf containing the specified data
1154  * with the specified type for presentation on a socket buffer.
1155  */
1156 struct mbuf *
sbcreatecontrol(const void * p,size_t size,int type,int level)1157 sbcreatecontrol(const void *p, size_t size, int type, int level)
1158 {
1159 	struct cmsghdr *cp;
1160 	struct mbuf *m;
1161 
1162 	if (CMSG_SPACE(size) > MCLBYTES) {
1163 		printf("sbcreatecontrol: message too large %zu\n", size);
1164 		return (NULL);
1165 	}
1166 
1167 	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
1168 		return (NULL);
1169 	if (CMSG_SPACE(size) > MLEN) {
1170 		MCLGET(m, M_DONTWAIT);
1171 		if ((m->m_flags & M_EXT) == 0) {
1172 			m_free(m);
1173 			return NULL;
1174 		}
1175 	}
1176 	cp = mtod(m, struct cmsghdr *);
1177 	memset(cp, 0, CMSG_SPACE(size));
1178 	memcpy(CMSG_DATA(cp), p, size);
1179 	m->m_len = CMSG_SPACE(size);
1180 	cp->cmsg_len = CMSG_LEN(size);
1181 	cp->cmsg_level = level;
1182 	cp->cmsg_type = type;
1183 	return (m);
1184 }
1185