xref: /dragonfly/sys/kern/uipc_sockbuf.c (revision b272101acc636ac635f83d03265ef6a44a3ba51a)
1 /*
2  * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 1982, 1986, 1988, 1990, 1993
4  *        The Regents of the University of California.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Neither the name of the University nor the names of its contributors
15  *    may be used to endorse or promote products derived from this software
16  *    without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * @(#)uipc_socket2.c         8.1 (Berkeley) 6/10/93
31  * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
32  */
33 
34 #include "opt_param.h"
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/domain.h>
38 #include <sys/file.h>         /* for maxfiles */
39 #include <sys/kernel.h>
40 #include <sys/proc.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/stat.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 
49 #include <sys/thread2.h>
50 #include <sys/msgport2.h>
51 
52 /*
53  * Routines to add and remove data from an mbuf queue.
54  *
55  * The routines sbappend() or sbappendrecord() are normally called to
56  * append new mbufs to a socket buffer.  sbappendrecord() differs from
57  * sbappend() in that data supplied is treated as the beginning of a new
58  * record.  sbappend() only begins a new record if the last mbuf in the
59  * sockbuf is marked M_EOR.
60  *
61  * To place a sender's address, optional access rights, and data in a
62  * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
63  * used.   These functions also begin a new record.
64  *
65  * Reliable protocols may use the socket send buffer to hold data
66  * awaiting acknowledgement.  A protocol normally copies data out of
67  * the socket send buffer with m_copym for output to a peer, and then
68  * removes the data from the socket buffer with sbdrop() or
69  * sbdroprecord() once it has been acknowledged by the peer.
70  */
71 
72 /*
73  * Append mbuf chain m to the last record in the socket buffer sb.
74  * The additional space associated the mbuf chain is recorded in sb.
75  * Empty mbufs are discarded and mbufs are compacted where possible.
76  *
77  * If M_EOR is set in the first or last mbuf of the last record, the
78  * mbuf chain is appended as a new record.  M_EOR is usually just set
79  * in the last mbuf of the last record's mbuf chain (see sbcompress()),
80  * but this may be changed in the future since there is no real need
81  * to propogate the flag any more.
82  */
83 void
sbappend(struct sockbuf * sb,struct mbuf * m)84 sbappend(struct sockbuf *sb, struct mbuf *m)
85 {
86           struct mbuf *n;
87 
88           mbuftrackid(m, 16);
89 
90           if (m) {
91                     n = sb->sb_lastrecord;
92                     if (n) {
93                               if (n->m_flags & M_EOR) {
94                                         sbappendrecord(sb, m);
95                                         return;
96                               }
97                     }
98                     n = sb->sb_lastmbuf;
99                     if (n) {
100                               if (n->m_flags & M_EOR) {
101                                         sbappendrecord(sb, m);
102                                         return;
103                               }
104                     }
105                     sbcompress(sb, m, n);
106           }
107 }
108 
109 /*
110  * sbappendstream() is an optimized form of sbappend() for protocols
111  * such as TCP that only have one record in the socket buffer, are
112  * not PR_ATOMIC, nor allow MT_CONTROL data.  A protocol that uses
113  * sbappendstream() must use sbappendstream() exclusively.
114  */
115 void
sbappendstream(struct sockbuf * sb,struct mbuf * m)116 sbappendstream(struct sockbuf *sb, struct mbuf *m)
117 {
118           mbuftrackid(m, 17);
119           KKASSERT(m->m_nextpkt == NULL);
120           sbcompress(sb, m, sb->sb_lastmbuf);
121 }
122 
123 #ifdef SOCKBUF_DEBUG
124 
125 void
_sbcheck(struct sockbuf * sb)126 _sbcheck(struct sockbuf *sb)
127 {
128           struct mbuf *m;
129           struct mbuf *n = NULL;
130           u_long len = 0, mbcnt = 0;
131 
132           for (m = sb->sb_mb; m; m = n) {
133               n = m->m_nextpkt;
134               if (n == NULL && sb->sb_lastrecord != m) {
135                         kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m);
136                         panic("sbcheck1");
137               }
138               for (; m; m = m->m_next) {
139                     len += m->m_len;
140                     mbcnt += MSIZE;
141                     if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
142                               mbcnt += m->m_ext.ext_size;
143                     if (n == NULL && m->m_next == NULL) {
144                               if (sb->sb_lastmbuf != m) {
145                                         kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m);
146                                         panic("sbcheck2");
147                               }
148                     }
149               }
150           }
151           if (sb->sb_mb == NULL) {
152               if (sb->sb_lastrecord != NULL) {
153                     kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
154                               sb, sb->sb_lastrecord);
155                     panic("sbcheck3");
156               }
157               if (sb->sb_lastmbuf != NULL) {
158                     kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
159                               sb, sb->sb_lastmbuf);
160                     panic("sbcheck4");
161               }
162           }
163           if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
164                     kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
165                         sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
166                     panic("sbcheck5");
167           }
168 }
169 
170 #endif
171 
172 /*
173  * Same as sbappend(), except the mbuf chain begins a new record.
174  */
175 void
sbappendrecord(struct sockbuf * sb,struct mbuf * m0)176 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
177 {
178           struct mbuf *firstmbuf;
179           struct mbuf *secondmbuf;
180 
181           if (m0 == NULL)
182                     return;
183           mbuftrackid(m0, 18);
184 
185           sbcheck(sb);
186 
187           /*
188            * Break the first mbuf off from the rest of the mbuf chain.
189            */
190           firstmbuf = m0;
191           secondmbuf = m0->m_next;
192           m0->m_next = NULL;
193 
194           /*
195            * Insert the first mbuf of the m0 mbuf chain as the last record of
196            * the sockbuf.  Note this permits zero length records!  Keep the
197            * sockbuf state consistent.
198            */
199           if (sb->sb_mb == NULL)
200                     sb->sb_mb = firstmbuf;
201           else
202                     sb->sb_lastrecord->m_nextpkt = firstmbuf;
203           sb->sb_lastrecord = firstmbuf;          /* update hint for new last record */
204           sb->sb_lastmbuf = firstmbuf;  /* update hint for new last mbuf */
205 
206           /*
207            * propagate the EOR flag so sbcompress() can pick it up
208            */
209           if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
210                     firstmbuf->m_flags &= ~M_EOR;
211                     secondmbuf->m_flags |= M_EOR;
212           }
213 
214           /*
215            * The succeeding call to sbcompress() omits accounting for
216            * the first mbuf, so do it here.
217            */
218           sballoc(sb, firstmbuf);
219 
220           /* Compact the rest of the mbuf chain in after the first mbuf. */
221           sbcompress(sb, secondmbuf, firstmbuf);
222 }
223 
224 /*
225  * Append address and data, and optionally, control (ancillary) data
226  * to the receive queue of a socket.  If present,
227  * m0 must include a packet header with total length.
228  * Returns 0 if insufficient mbufs.
229  */
230 int
sbappendaddr(struct sockbuf * sb,const struct sockaddr * asa,struct mbuf * m0,struct mbuf * control)231 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
232                struct mbuf *control)
233 {
234           struct mbuf *m, *n;
235           int eor;
236 
237           mbuftrackid(m0, 19);
238           mbuftrackid(control, 20);
239           if (m0 && (m0->m_flags & M_PKTHDR) == 0)
240                     panic("sbappendaddr");
241           sbcheck(sb);
242 
243           for (n = control; n; n = n->m_next) {
244                     if (n->m_next == NULL)        /* keep pointer to last control buf */
245                               break;
246           }
247           if (asa->sa_len > MLEN)
248                     return (0);
249           MGET(m, M_NOWAIT, MT_SONAME);
250           if (m == NULL)
251                     return (0);
252           KKASSERT(m->m_nextpkt == NULL);
253           m->m_len = asa->sa_len;
254           bcopy(asa, mtod(m, caddr_t), asa->sa_len);
255           if (n)
256                     n->m_next = m0;               /* concatenate data to control */
257           else
258                     control = m0;
259           m->m_next = control;
260           for (n = m; n; n = n->m_next)
261                     sballoc(sb, n);
262 
263           if (sb->sb_mb == NULL)
264                     sb->sb_mb = m;
265           else
266                     sb->sb_lastrecord->m_nextpkt = m;
267           sb->sb_lastrecord = m;
268 
269           /*
270            * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
271            * so sbappend() can find it.
272            */
273           eor = m->m_flags;
274           while (m->m_next) {
275                     m->m_flags &= ~M_EOR;
276                     m = m->m_next;
277                     eor |= m->m_flags;
278           }
279           m->m_flags |= eor & M_EOR;
280           sb->sb_lastmbuf = m;
281 
282           return (1);
283 }
284 
285 /*
286  * Append control information followed by data. Both the control and data
287  * must be non-null.
288  */
289 int
sbappendcontrol(struct sockbuf * sb,struct mbuf * m0,struct mbuf * control)290 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
291 {
292           struct mbuf *n;
293           u_int length, cmbcnt, m0mbcnt;
294           int eor;
295 
296           KASSERT(control != NULL, ("sbappendcontrol"));
297           KKASSERT(control->m_nextpkt == NULL);
298           sbcheck(sb);
299 
300           mbuftrackid(m0, 21);
301           mbuftrackid(control, 22);
302 
303           length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);
304 
305           KKASSERT(m0 != NULL);
306 
307           n->m_next = m0;                         /* concatenate data to control */
308 
309           if (sb->sb_mb == NULL)
310                     sb->sb_mb = control;
311           else
312                     sb->sb_lastrecord->m_nextpkt = control;
313           sb->sb_lastrecord = control;
314 
315           /*
316            * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
317            * so sbappend() can find it.
318            */
319           eor = m0->m_flags;
320           while (m0->m_next) {
321                     m0->m_flags &= ~M_EOR;
322                     m0 = m0->m_next;
323                     eor |= m0->m_flags;
324           }
325           m0->m_flags |= eor & M_EOR;
326           sb->sb_lastmbuf = m0;
327 
328           sb->sb_cc += length;
329           sb->sb_mbcnt += cmbcnt + m0mbcnt;
330 
331           return (1);
332 }
333 
334 /*
335  * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
336  * If tailm is null, the buffer is presumed empty.  Also, as a side-effect,
337  * increment the sockbuf counts for each mbuf in the chain.
338  */
339 void
sbcompress(struct sockbuf * sb,struct mbuf * m,struct mbuf * tailm)340 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
341 {
342           int eor = 0;
343           struct mbuf *free_chain = NULL;
344 
345           mbuftrackid(m, 23);
346 
347           sbcheck(sb);
348           while (m) {
349                     struct mbuf *o;
350 
351                     eor |= m->m_flags & M_EOR;
352                     /*
353                      * Disregard empty mbufs as long as we don't encounter
354                      * an end-of-record or there is a trailing mbuf of
355                      * the same type to propagate the EOR flag to.
356                      *
357                      * Defer the m_free() call because it can block and break
358                      * the atomicy of the sockbuf.
359                      */
360                     if (m->m_len == 0 &&
361                         (eor == 0 ||
362                          (((o = m->m_next) || (o = tailm)) &&
363                           o->m_type == m->m_type))) {
364                               o = m->m_next;
365                               m->m_next = free_chain;
366                               free_chain = m;
367                               m = o;
368                               continue;
369                     }
370 
371                     /*
372                      * See if we can coalesce with preceding mbuf.  Never try
373                      * to coalesce a mbuf representing an end-of-record or
374                      * a mbuf locked by userland for reading.
375                      */
376                     if (tailm && !(tailm->m_flags & (M_EOR | M_SOLOCKED)) &&
377                         M_WRITABLE(tailm) &&
378                         m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
379                         m->m_len <= M_TRAILINGSPACE(tailm) &&
380                         tailm->m_type == m->m_type) {
381                               u_long mbcnt_sz;
382 
383                               bcopy(mtod(m, caddr_t),
384                                     mtod(tailm, caddr_t) + tailm->m_len,
385                                     (unsigned)m->m_len);
386                               tailm->m_len += m->m_len;
387 
388                               sb->sb_cc += m->m_len;                  /* update sb counter */
389 
390                               /*
391                                * Fix the wrongly updated mbcnt_prealloc
392                                */
393                               mbcnt_sz = MSIZE;
394                               if (m->m_flags & M_EXT)
395                                         mbcnt_sz += m->m_ext.ext_size;
396                               atomic_subtract_long(&sb->sb_mbcnt_prealloc, mbcnt_sz);
397 
398                               o = m->m_next;
399                               m->m_next = free_chain;
400                               free_chain = m;
401                               m = o;
402                               continue;
403                     }
404 
405                     /* Insert whole mbuf. */
406                     if (tailm == NULL) {
407                               KASSERT(sb->sb_mb == NULL,
408                                         ("sbcompress: sb_mb not NULL"));
409                               sb->sb_mb = m;                /* only mbuf in sockbuf */
410                               sb->sb_lastrecord = m;        /* new last record */
411                     } else {
412                               tailm->m_next = m;  /* tack m on following tailm */
413                     }
414                     sb->sb_lastmbuf = m;          /* update last mbuf hint */
415 
416                     tailm = m;          /* just inserted mbuf becomes the new tail */
417                     m = m->m_next;                /* advance to next mbuf */
418                     tailm->m_next = NULL;         /* split inserted mbuf off from chain */
419 
420                     /* update sb counters for just added mbuf */
421                     sballoc(sb, tailm);
422 
423                     /* clear EOR on intermediate mbufs */
424                     tailm->m_flags &= ~M_EOR;
425           }
426 
427           /*
428            * Propogate EOR to the last mbuf
429            */
430           if (eor) {
431                     if (tailm)
432                               tailm->m_flags |= eor;
433                     else
434                               kprintf("semi-panic: sbcompress");
435           }
436 
437           /*
438            * Clean up any defered frees.
439            */
440           while (free_chain)
441                     free_chain = m_free(free_chain);
442 
443           sbcheck(sb);
444 }
445 
446 /*
447  * Free all mbufs in a sockbuf.
448  * Check that all resources are reclaimed.
449  */
450 void
sbflush(struct sockbuf * sb)451 sbflush(struct sockbuf *sb)
452 {
453           while (sb->sb_mbcnt) {
454                     /*
455                      * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
456                      * we would loop forever. Panic instead.
457                      */
458                     if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
459                               break;
460                     sbdrop(sb, (int)sb->sb_cc);
461           }
462           KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
463               ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
464               sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
465 }
466 
467 /*
468  * Drop data from (the front of) a sockbuf.  If the current record is
469  * exhausted this routine will move onto the next one and continue dropping
470  * data.
471  */
472 void
sbdrop(struct sockbuf * sb,int len)473 sbdrop(struct sockbuf *sb, int len)
474 {
475           struct mbuf *m;
476           struct mbuf *free_chain = NULL;
477 
478           sbcheck(sb);
479           crit_enter();
480 
481           m = sb->sb_mb;
482           while (m && len > 0) {
483                     if (m->m_len > len) {
484                               m->m_len -= len;
485                               m->m_data += len;
486                               sb->sb_cc -= len;
487                               atomic_subtract_long(&sb->sb_cc_prealloc, len);
488                               break;
489                     }
490                     len -= m->m_len;
491                     m = sbunlinkmbuf(sb, m, &free_chain);
492                     if (m == NULL && len)
493                               m = sb->sb_mb;
494           }
495 
496           /*
497            * Remove any trailing 0-length mbufs in the current record.  If
498            * the last record for which data was removed is now empty, m will be
499            * NULL.
500            */
501           while (m && m->m_len == 0) {
502                     m = sbunlinkmbuf(sb, m, &free_chain);
503           }
504           crit_exit();
505           if (free_chain)
506                     m_freem(free_chain);
507           sbcheck(sb);
508 }
509 
510 /*
511  * Drop a record off the front of a sockbuf and move the next record
512  * to the front.
513  *
514  * Must be called while holding a critical section.
515  */
516 void
sbdroprecord(struct sockbuf * sb)517 sbdroprecord(struct sockbuf *sb)
518 {
519           struct mbuf *m;
520           struct mbuf *n;
521 
522           sbcheck(sb);
523           m = sb->sb_mb;
524           if (m) {
525                     if ((sb->sb_mb = m->m_nextpkt) == NULL) {
526                               sb->sb_lastrecord = NULL;
527                               sb->sb_lastmbuf = NULL;
528                     }
529                     m->m_nextpkt = NULL;
530                     for (n = m; n; n = n->m_next)
531                               sbfree(sb, n);
532                     m_freem(m);
533                     sbcheck(sb);
534           }
535 }
536 
537 /*
538  * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
539  * Currently only the head mbuf of the sockbuf may be dropped this way.
540  *
541  * The next mbuf in the same record as the mbuf being removed is returned
542  * or NULL if the record is exhausted.  Note that other records may remain
543  * in the sockbuf when NULL is returned.
544  *
545  * Must be called while holding a critical section.
546  */
547 struct mbuf *
sbunlinkmbuf(struct sockbuf * sb,struct mbuf * m,struct mbuf ** free_chain)548 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
549 {
550           struct mbuf *n;
551 
552           KKASSERT(sb->sb_mb == m);
553           sbfree(sb, m);
554           n = m->m_next;
555           if (n) {
556                     sb->sb_mb = n;
557                     if (sb->sb_lastrecord == m)
558                               sb->sb_lastrecord = n;
559                     KKASSERT(sb->sb_lastmbuf != m);
560                     n->m_nextpkt = m->m_nextpkt;
561           } else {
562                     sb->sb_mb = m->m_nextpkt;
563                     if (sb->sb_lastrecord == m) {
564                               KKASSERT(sb->sb_mb == NULL);
565                               sb->sb_lastrecord = NULL;
566                     }
567                     if (sb->sb_mb == NULL)
568                               sb->sb_lastmbuf = NULL;
569           }
570           m->m_nextpkt = NULL;
571           if (free_chain) {
572                     m->m_next = *free_chain;
573                     *free_chain = m;
574           } else {
575                     m->m_next = NULL;
576           }
577           return(n);
578 }
579 
580 /*
581  * Create a "control" mbuf containing the specified data
582  * with the specified type for presentation on a socket buffer.
583  */
584 struct mbuf *
sbcreatecontrol(const void * p,size_t size,int type,int level)585 sbcreatecontrol(const void *p, size_t size, int type, int level)
586 {
587           struct cmsghdr *cp;
588           struct mbuf *m;
589 
590           if (CMSG_SPACE(size) > MCLBYTES)
591                     return (NULL);
592           m = m_getl(CMSG_SPACE(size), M_NOWAIT, MT_CONTROL, 0, NULL);
593           if (m == NULL)
594                     return (NULL);
595           m->m_len = CMSG_SPACE(size);
596           cp = mtod(m, struct cmsghdr *);
597           if (p != NULL)
598                     memcpy(CMSG_DATA(cp), p, size);
599           cp->cmsg_len = CMSG_LEN(size);
600           cp->cmsg_level = level;
601           cp->cmsg_type = type;
602           mbuftrackid(m, 24);
603           return (m);
604 }
605