1 /*        $NetBSD: ip_encap.c,v 1.78 2025/02/26 04:49:45 andvar Exp $ */
2 /*        $KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $   */
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 /*
33  * My grandfather said that there's a devil inside tunnelling technology...
34  *
35  * We have surprisingly many protocols that want packets with IP protocol
36  * #4 or #41.  Here's a list of protocols that want protocol #41:
37  *        RFC1933 configured tunnel
38  *        RFC1933 automatic tunnel
39  *        RFC2401 IPsec tunnel
40  *        RFC2473 IPv6 generic packet tunnelling
41  *        RFC2529 6over4 tunnel
42  *        RFC3056 6to4 tunnel
43  *        isatap tunnel
44  *        mobile-ip6 (uses RFC2473)
 * Here's a list of protocols that want protocol #4:
46  *        RFC1853 IPv4-in-IPv4 tunnelling
47  *        RFC2003 IPv4 encapsulation within IPv4
48  *        RFC2344 reverse tunnelling for mobile-ip4
49  *        RFC2401 IPsec tunnel
 * Well, what can I say.  They impose different en/decapsulation mechanisms
 * from each other, so they need separate protocol handlers.  The only one
 * we can easily determine by protocol # is IPsec, which always has an
 * AH/ESP/IPComp header right after the outer IP header.
54  *
55  * So, clearly good old protosw does not work for protocol #4 and #41.
56  * The code will let you match protocol via src/dst address pair.
57  */
58 /* XXX is M_NETADDR correct? */
59 
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.78 2025/02/26 04:49:45 andvar Exp $");
62 
63 #ifdef _KERNEL_OPT
64 #include "opt_mrouting.h"
65 #include "opt_inet.h"
66 #include "opt_net_mpsafe.h"
67 #endif
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/socket.h>
72 #include <sys/socketvar.h> /* for softnet_lock */
73 #include <sys/sockio.h>
74 #include <sys/mbuf.h>
75 #include <sys/errno.h>
76 #include <sys/queue.h>
77 #include <sys/kmem.h>
78 #include <sys/mutex.h>
79 #include <sys/condvar.h>
80 #include <sys/psref.h>
81 #include <sys/pslist.h>
82 #include <sys/thmap.h>
83 
84 #include <net/if.h>
85 
86 #include <netinet/in.h>
87 #include <netinet/in_systm.h>
88 #include <netinet/ip.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip_encap.h>
91 #ifdef MROUTING
92 #include <netinet/ip_mroute.h>
93 #endif /* MROUTING */
94 
95 #ifdef INET6
96 #include <netinet/ip6.h>
97 #include <netinet6/ip6_var.h>
98 #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */
99 #include <netinet6/in6_var.h>
100 #include <netinet6/in6_pcb.h>
101 #include <netinet/icmp6.h>
102 #endif
103 
/*
 * ENCAP_MPSAFE follows NET_MPSAFE.  When it is not defined,
 * encap_attach_func() runs at splsoftnet().
 */
#ifdef NET_MPSAFE
#define ENCAP_MPSAFE	1
#endif

/*
 * Direction of the packet handed to encap[46]_lookup().  It selects which
 * header address is treated as "mine": the destination address for
 * INBOUND, the source address for OUTBOUND.
 */
enum direction { INBOUND, OUTBOUND };

/* Forward declarations of the file-local helpers. */
#ifdef INET
static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
#ifdef INET6
static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
static int encap_add(struct encaptab *);
static int encap_remove(struct encaptab *);
static void encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
static void encap_key_init(struct encap_key *, const struct sockaddr *,
    const struct sockaddr *);
static void encap_key_inc(struct encap_key *);
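/*
 * In encap[46]_lookup(), ep->func can sleep (e.g. rtalloc1) while walking
 * encap_table.  So the walk cannot stay inside a pserialize read section
 * across that call: each element is pinned with a psref and the read
 * section is left around the call to ep->func.
 */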
/*
 * encaptab.list holds the attachments registered through
 * encap_attach_func() (inserted by encap_add()).  psz and elem_class are
 * created in encapinit(); elem_class is the psref class used for every
 * struct encaptab, whether it lives on this list or in encap_map[].
 */
static struct {
	struct pslist_head	list;
	pserialize_t		psz;
	struct psref_class	*elem_class; /* for the element of et_list */
} encaptab  __cacheline_aligned = {
	.list = PSLIST_INITIALIZER,
};
#define encap_table encaptab.list

/*
 * State behind encap_lock_enter()/encap_lock_exit(): busy records the lwp
 * that currently owns the lock (NULL when free); lock and cv protect and
 * signal changes of busy.
 */
static struct {
	kmutex_t	lock;
	kcondvar_t	cv;
	struct lwp	*busy;
} encap_whole __cacheline_aligned;

/* Address-pair attachments (see encap_attach_addr()), one map per family. */
static thmap_t *encap_map[2]; /* 0 for AF_INET, 1 for AF_INET6 */

/* Set by encapinit() so that repeated calls are no-ops. */
static bool encap_initialized = false;
147 /*
148  * must be done before other encap interfaces initialization.
149  */
150 void
encapinit(void)151 encapinit(void)
152 {
153 
154           if (encap_initialized)
155                     return;
156 
157           encaptab.psz = pserialize_create();
158           encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET);
159 
160           mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE);
161           cv_init(&encap_whole.cv, "ip_encap cv");
162           encap_whole.busy = NULL;
163 
164           encap_initialized = true;
165 }
166 
167 void
encap_init(void)168 encap_init(void)
169 {
170           static int initialized = 0;
171 
172           if (initialized)
173                     return;
174           initialized++;
175 #if 0
176           /*
177            * we cannot use LIST_INIT() here, since drivers may want to call
178            * encap_attach(), on driver attach.  encap_init() will be called
179            * on AF_INET{,6} initialization, which happens after driver
180            * initialization - using LIST_INIT() here can nuke encap_attach()
181            * from drivers.
182            */
183           PSLIST_INIT(&encap_table);
184 #endif
185 
186           encap_map[0] = thmap_create(0, NULL, THMAP_NOCOPY);
187 #ifdef INET6
188           encap_map[1] = thmap_create(0, NULL, THMAP_NOCOPY);
189 #endif
190 }
191 
192 #ifdef INET
/*
 * Find the best-matching IPv4 attachment for the packet in m.
 *
 * m must have the IPv4 header in its first mbuf.  off and proto are passed
 * through unchanged to the attachments' priority functions.  dir selects
 * which header address is "mine" (destination for INBOUND, source for
 * OUTBOUND).
 *
 * Two sets of attachments are searched:
 *  1. encap_map[0], keyed by (mine, yours, seq).  All entries sharing an
 *     address pair are visited by probing seq = 0, 1, 2, ... until
 *     thmap_get() misses.
 *  2. encap_table, the list of function-matched attachments.
 *
 * The candidate with the strictly highest positive priority wins.  On
 * success the winner is returned with a reference held in *match_psref,
 * which the caller releases with psref_release(); NULL is returned when
 * nothing matched.
 */
static struct encaptab *
encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    struct psref *match_psref)
{
	struct ip *ip;
	struct ip_pack4 pack;
	struct encaptab *ep, *match;
	int prio, matchprio;
	int s;
	thmap_t *emap = encap_map[0];
	struct encap_key key;

	KASSERT(m->m_len >= sizeof(*ip));

	ip = mtod(m, struct ip *);

	/* Build the (mine, yours) address pair from the IP header. */
	memset(&pack, 0, sizeof(pack));
	pack.p.sp_len = sizeof(pack);
	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
	if (dir == INBOUND) {
		pack.mine.sin_addr = ip->ip_dst;
		pack.yours.sin_addr = ip->ip_src;
	} else {
		pack.mine.sin_addr = ip->ip_src;
		pack.yours.sin_addr = ip->ip_dst;
	}

	match = NULL;
	matchprio = 0;

	s = pserialize_read_enter();

	/* Pass 1: address-pair attachments; the key starts with seq 0. */
	encap_key_init(&key, sintosa(&pack.mine), sintosa(&pack.yours));
	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
		struct psref elem_psref;

		KASSERT(ep->af == AF_INET);

		/* A non-negative ep->proto restricts the attachment. */
		if (ep->proto >= 0 && ep->proto != proto) {
			encap_key_inc(&key);
			continue;
		}

		/* Pin ep so it stays valid outside the read section. */
		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		if (ep->func) {
			/* Leave the read section around the callback. */
			pserialize_read_exit(s);
			prio = (*ep->func)(m, off, proto, ep->arg);
			s = pserialize_read_enter();
		} else {
			/* No callback: the address pair alone is the match. */
			prio = pack.mine.sin_len + pack.yours.sin_len;
		}

		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			encap_key_inc(&key);
			continue;
		}
		if (prio > matchprio) {
			/* release last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			/* Keep a second reference for the caller. */
			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}

		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		encap_key_inc(&key);
	}

	/* Pass 2: function-matched attachments on encap_table. */
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		if (ep->af != AF_INET)
			continue;
		if (ep->proto >= 0 && ep->proto != proto)
			continue;

		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		pserialize_read_exit(s);
		/* ep->func is sleepable. e.g. rtalloc1 */
		prio = (*ep->func)(m, off, proto, ep->arg);
		s = pserialize_read_enter();

		/*
		 * We prioritize the matches by using bit length of the
		 * matches.  user-supplied matching function
		 * should return the bit length of the matches (for example,
		 * if both src/dst are matched for IPv4, 64 should be returned).
		 * 0 or negative return value means "it did not match".
		 *
		 * We need to loop through all the possible candidates
		 * to get the best match - the search takes O(n) for
		 * n attachments (i.e. interfaces).
		 */
		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
		}
		if (prio > matchprio) {
			/* release last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		KASSERTMSG((match == NULL) || psref_held(&match->psref,
			encaptab.elem_class),
		    "current match = %p, but not hold its psref", match);

		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	return match;
}
323 
324 void
encap4_input(struct mbuf * m,int off,int proto)325 encap4_input(struct mbuf *m, int off, int proto)
326 {
327           const struct encapsw *esw;
328           struct encaptab *match;
329           struct psref match_psref;
330 
331           match = encap4_lookup(m, off, proto, INBOUND, &match_psref);
332           if (match) {
333                     /* found a match, "match" has the best one */
334                     esw = match->esw;
335                     if (esw && esw->encapsw4.pr_input) {
336                               (*esw->encapsw4.pr_input)(m, off, proto, match->arg);
337                               psref_release(&match_psref, &match->psref,
338                                   encaptab.elem_class);
339                     } else {
340                               psref_release(&match_psref, &match->psref,
341                                   encaptab.elem_class);
342                               m_freem(m);
343                     }
344                     return;
345           }
346 
347           /* last resort: inject to raw socket */
348           SOFTNET_LOCK_IF_NET_MPSAFE();
349           rip_input(m, off, proto);
350           SOFTNET_UNLOCK_IF_NET_MPSAFE();
351 }
352 #endif
353 
354 #ifdef INET6
/*
 * Find the best-matching IPv6 attachment for the packet in m.
 *
 * m must have the IPv6 header in its first mbuf.  off and proto are passed
 * through unchanged to the attachments' priority functions.  dir selects
 * which header address is "mine" (destination for INBOUND, source for
 * OUTBOUND).
 *
 * Two sets of attachments are searched:
 *  1. encap_map[1], keyed by (mine, yours, seq).  All entries sharing an
 *     address pair are visited by probing seq = 0, 1, 2, ... until
 *     thmap_get() misses.
 *  2. encap_table, the list of function-matched attachments.
 *
 * The candidate with the strictly highest positive priority wins.  On
 * success the winner is returned with a reference held in *match_psref,
 * which the caller releases with psref_release(); NULL is returned when
 * nothing matched.
 */
static struct encaptab *
encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    struct psref *match_psref)
{
	struct ip6_hdr *ip6;
	struct ip_pack6 pack;
	int prio, matchprio;
	int s;
	struct encaptab *ep, *match;
	thmap_t *emap = encap_map[1];
	struct encap_key key;

	KASSERT(m->m_len >= sizeof(*ip6));

	ip6 = mtod(m, struct ip6_hdr *);

	/* Build the (mine, yours) address pair from the IPv6 header. */
	memset(&pack, 0, sizeof(pack));
	pack.p.sp_len = sizeof(pack);
	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
	if (dir == INBOUND) {
		pack.mine.sin6_addr = ip6->ip6_dst;
		pack.yours.sin6_addr = ip6->ip6_src;
	} else {
		pack.mine.sin6_addr = ip6->ip6_src;
		pack.yours.sin6_addr = ip6->ip6_dst;
	}

	match = NULL;
	matchprio = 0;

	s = pserialize_read_enter();

	/* Pass 1: address-pair attachments; the key starts with seq 0. */
	encap_key_init(&key, sin6tosa(&pack.mine), sin6tosa(&pack.yours));
	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
		struct psref elem_psref;

		KASSERT(ep->af == AF_INET6);

		/* A non-negative ep->proto restricts the attachment. */
		if (ep->proto >= 0 && ep->proto != proto) {
			encap_key_inc(&key);
			continue;
		}

		/* Pin ep so it stays valid outside the read section. */
		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		if (ep->func) {
			/* Leave the read section around the callback. */
			pserialize_read_exit(s);
			prio = (*ep->func)(m, off, proto, ep->arg);
			s = pserialize_read_enter();
		} else {
			/* No callback: the address pair alone is the match. */
			prio = pack.mine.sin6_len + pack.yours.sin6_len;
		}

		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			encap_key_inc(&key);
			continue;
		}
		if (prio > matchprio) {
			/* release last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			/* Keep a second reference for the caller. */
			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		encap_key_inc(&key);
	}

	/* Pass 2: function-matched attachments on encap_table. */
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		if (ep->af != AF_INET6)
			continue;
		if (ep->proto >= 0 && ep->proto != proto)
			continue;

		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);

		pserialize_read_exit(s);
		/* ep->func is sleepable. e.g. rtalloc1 */
		prio = (*ep->func)(m, off, proto, ep->arg);
		s = pserialize_read_enter();

		/* see encap4_lookup() for issues here */
		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
		}
		if (prio > matchprio) {
			/* release last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		KASSERTMSG((match == NULL) || psref_held(&match->psref,
			encaptab.elem_class),
		    "current match = %p, but not hold its psref", match);

		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	return match;
}
475 
476 int
encap6_input(struct mbuf ** mp,int * offp,int proto)477 encap6_input(struct mbuf **mp, int *offp, int proto)
478 {
479           struct mbuf *m = *mp;
480           const struct encapsw *esw;
481           struct encaptab *match;
482           struct psref match_psref;
483           int rv;
484 
485           match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref);
486 
487           if (match) {
488                     /* found a match */
489                     esw = match->esw;
490                     if (esw && esw->encapsw6.pr_input) {
491                               int ret;
492                               ret = (*esw->encapsw6.pr_input)(mp, offp, proto,
493                                   match->arg);
494                               psref_release(&match_psref, &match->psref,
495                                   encaptab.elem_class);
496                               return ret;
497                     } else {
498                               psref_release(&match_psref, &match->psref,
499                                   encaptab.elem_class);
500                               m_freem(m);
501                               return IPPROTO_DONE;
502                     }
503           }
504 
505           /* last resort: inject to raw socket */
506           SOFTNET_LOCK_IF_NET_MPSAFE();
507           rv = rip6_input(mp, offp, proto);
508           SOFTNET_UNLOCK_IF_NET_MPSAFE();
509           return rv;
510 }
511 #endif
512 
/*
 * Publish ep at the head of encap_table.  The caller must hold the encap
 * lock (see encap_lock_enter()).  Always returns 0.
 */
static int
encap_add(struct encaptab *ep)
{

	KASSERT(encap_lock_held());

	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);

	return 0;
}
523 
524 static int
encap_remove(struct encaptab * ep)525 encap_remove(struct encaptab *ep)
526 {
527           int error = 0;
528 
529           KASSERT(encap_lock_held());
530 
531           PSLIST_WRITER_REMOVE(ep, chain);
532 
533           return error;
534 }
535 
536 static void
encap_afcheck(int af,const struct sockaddr * sp,const struct sockaddr * dp)537 encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp)
538 {
539 
540           KASSERT(sp != NULL && dp != NULL);
541           KASSERT(sp->sa_len == dp->sa_len);
542           KASSERT(af == sp->sa_family && af == dp->sa_family);
543 
544           socklen_t len __diagused = sockaddr_getsize_by_family(af);
545           KASSERT(len != 0 && len == sp->sa_len && len == dp->sa_len);
546 }
547 
548 const struct encaptab *
encap_attach_func(int af,int proto,encap_priofunc_t * func,const struct encapsw * esw,void * arg)549 encap_attach_func(int af, int proto,
550     encap_priofunc_t *func,
551     const struct encapsw *esw, void *arg)
552 {
553           struct encaptab *ep;
554           int error;
555 #ifndef ENCAP_MPSAFE
556           int s;
557 
558           s = splsoftnet();
559 #endif
560 
561           ASSERT_SLEEPABLE();
562 
563           /* sanity check on args */
564           KASSERT(func != NULL);
565           KASSERT(af == AF_INET
566 #ifdef INET6
567               || af == AF_INET6
568 #endif
569           );
570 
571           ep = kmem_alloc(sizeof(*ep), KM_SLEEP);
572           memset(ep, 0, sizeof(*ep));
573 
574           ep->af = af;
575           ep->proto = proto;
576           ep->func = func;
577           ep->esw = esw;
578           ep->arg = arg;
579           psref_target_init(&ep->psref, encaptab.elem_class);
580 
581           error = encap_add(ep);
582           if (error)
583                     goto gc;
584 
585           error = 0;
586 #ifndef ENCAP_MPSAFE
587           splx(s);
588 #endif
589           return ep;
590 
591 gc:
592           kmem_free(ep, sizeof(*ep));
593 #ifndef ENCAP_MPSAFE
594           splx(s);
595 #endif
596           return NULL;
597 }
598 
599 static void
encap_key_init(struct encap_key * key,const struct sockaddr * local,const struct sockaddr * remote)600 encap_key_init(struct encap_key *key,
601     const struct sockaddr *local, const struct sockaddr *remote)
602 {
603 
604           memset(key, 0, sizeof(*key));
605 
606           sockaddr_copy(&key->local_sa, sizeof(key->local_u), local);
607           sockaddr_copy(&key->remote_sa, sizeof(key->remote_u), remote);
608 }
609 
610 static void
encap_key_inc(struct encap_key * key)611 encap_key_inc(struct encap_key *key)
612 {
613 
614           (key->seq)++;
615 }
616 
617 static void
encap_key_dec(struct encap_key * key)618 encap_key_dec(struct encap_key *key)
619 {
620 
621           (key->seq)--;
622 }
623 
624 static void
encap_key_copy(struct encap_key * dst,const struct encap_key * src)625 encap_key_copy(struct encap_key *dst, const struct encap_key *src)
626 {
627 
628           memset(dst, 0, sizeof(*dst));
629           *dst = *src;
630 }
631 
632 /*
633  * src is always my side, and dst is always remote side.
634  * Return value will be necessary as input (cookie) for encap_detach().
635  */
636 const struct encaptab *
encap_attach_addr(int af,int proto,const struct sockaddr * src,const struct sockaddr * dst,encap_priofunc_t * func,const struct encapsw * esw,void * arg)637 encap_attach_addr(int af, int proto,
638     const struct sockaddr *src, const struct sockaddr *dst,
639     encap_priofunc_t *func,
640     const struct encapsw *esw, void *arg)
641 {
642           struct encaptab *ep;
643           size_t l;
644           thmap_t *emap;
645           void *retep;
646           struct ip_pack4 *pack4;
647 #ifdef INET6
648           struct ip_pack6 *pack6;
649 #endif
650 
651           ASSERT_SLEEPABLE();
652 
653           encap_afcheck(af, src, dst);
654 
655           switch (af) {
656           case AF_INET:
657                     l = sizeof(*pack4);
658                     emap = encap_map[0];
659                     break;
660 #ifdef INET6
661           case AF_INET6:
662                     l = sizeof(*pack6);
663                     emap = encap_map[1];
664                     break;
665 #endif
666           default:
667                     return NULL;
668           }
669 
670           ep = kmem_zalloc(sizeof(*ep), KM_SLEEP);
671           ep->addrpack = kmem_zalloc(l, KM_SLEEP);
672           ep->addrpack->sa_len = l & 0xff;
673           ep->af = af;
674           ep->proto = proto;
675           ep->flag = IP_ENCAP_ADDR_ENABLE;
676           switch (af) {
677           case AF_INET:
678                     pack4 = (struct ip_pack4 *)ep->addrpack;
679                     ep->src = (struct sockaddr *)&pack4->mine;
680                     ep->dst = (struct sockaddr *)&pack4->yours;
681                     break;
682 #ifdef INET6
683           case AF_INET6:
684                     pack6 = (struct ip_pack6 *)ep->addrpack;
685                     ep->src = (struct sockaddr *)&pack6->mine;
686                     ep->dst = (struct sockaddr *)&pack6->yours;
687                     break;
688 #endif
689           }
690           memcpy(ep->src, src, src->sa_len);
691           memcpy(ep->dst, dst, dst->sa_len);
692           ep->esw = esw;
693           ep->arg = arg;
694           ep->func = func;
695           psref_target_init(&ep->psref, encaptab.elem_class);
696 
697           encap_key_init(&ep->key, src, dst);
698           while ((retep = thmap_put(emap, &ep->key, sizeof(ep->key), ep)) != ep)
699                     encap_key_inc(&ep->key);
700           return ep;
701 }
702 
703 
704 /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */
705 
706 #ifdef INET6
707 void *
encap6_ctlinput(int cmd,const struct sockaddr * sa,void * d0)708 encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0)
709 {
710           void *d = d0;
711           struct ip6_hdr *ip6;
712           struct mbuf *m;
713           int off;
714           struct ip6ctlparam *ip6cp = NULL;
715           int nxt;
716           int s;
717           struct encaptab *ep;
718           const struct encapsw *esw;
719 
720           if (sa->sa_family != AF_INET6 ||
721               sa->sa_len != sizeof(struct sockaddr_in6))
722                     return NULL;
723 
724           if ((unsigned)cmd >= PRC_NCMDS)
725                     return NULL;
726           if (cmd == PRC_HOSTDEAD)
727                     d = NULL;
728           else if (cmd == PRC_MSGSIZE)
729                     ; /* special code is present, see below */
730           else if (inet6ctlerrmap[cmd] == 0)
731                     return NULL;
732 
733           /* if the parameter is from icmp6, decode it. */
734           if (d != NULL) {
735                     ip6cp = (struct ip6ctlparam *)d;
736                     m = ip6cp->ip6c_m;
737                     ip6 = ip6cp->ip6c_ip6;
738                     off = ip6cp->ip6c_off;
739                     nxt = ip6cp->ip6c_nxt;
740 
741                     if (ip6 && cmd == PRC_MSGSIZE) {
742                               int valid = 0;
743                               struct encaptab *match;
744                               struct psref elem_psref;
745 
746                               /*
747                               * Check to see if we have a valid encap configuration.
748                               */
749                               match = encap6_lookup(m, off, nxt, OUTBOUND,
750                                   &elem_psref);
751                               if (match) {
752                                         valid++;
753                                         psref_release(&elem_psref, &match->psref,
754                                             encaptab.elem_class);
755                               }
756 
757                               /*
758                               * Depending on the value of "valid" and routing table
759                               * size (mtudisc_{hi,lo}wat), we will:
760                               * - recalculate the new MTU and create the
761                               *   corresponding routing entry, or
762                               * - ignore the MTU change notification.
763                               */
764                               icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
765                     }
766           } else {
767                     m = NULL;
768                     ip6 = NULL;
769                     nxt = -1;
770           }
771 
772           /* inform all listeners */
773 
774           s = pserialize_read_enter();
775           PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
776                     struct psref elem_psref;
777 
778                     if (ep->af != AF_INET6)
779                               continue;
780                     if (ep->proto >= 0 && ep->proto != nxt)
781                               continue;
782 
783                     /* should optimize by looking at address pairs */
784 
785                     /* XXX need to pass ep->arg or ep itself to listeners */
786                     psref_acquire(&elem_psref, &ep->psref,
787                         encaptab.elem_class);
788                     esw = ep->esw;
789                     if (esw && esw->encapsw6.pr_ctlinput) {
790                               pserialize_read_exit(s);
791                               /* pr_ctlinput is sleepable. e.g. rtcache_free */
792                               (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg);
793                               s = pserialize_read_enter();
794                     }
795                     psref_release(&elem_psref, &ep->psref,
796                         encaptab.elem_class);
797           }
798           pserialize_read_exit(s);
799 
800           rip6_ctlinput(cmd, sa, d0);
801           return NULL;
802 }
803 #endif
804 
/*
 * Remove and free an address-pair attachment created by
 * encap_attach_addr().  The caller must hold the encap lock.
 *
 * Returns 0 on success, EINVAL for an unsupported address family, or
 * ENOENT when the map does not hold ep under ep->key.
 */
static int
encap_detach_addr(const struct encaptab *ep)
{
	thmap_t *emap;
	struct encaptab *retep;
	struct encaptab *target;
	void *thgc;
	struct encap_key key;

	KASSERT(encap_lock_held());
	KASSERT(ep->flag & IP_ENCAP_ADDR_ENABLE);

	switch (ep->af) {
	case AF_INET:
		emap = encap_map[0];
		break;
#ifdef INET6
	case AF_INET6:
		emap = encap_map[1];
		break;
#endif
	default:
		return EINVAL;
	}

	retep = thmap_del(emap, &ep->key, sizeof(ep->key));
	if (retep != ep) {
		return ENOENT;
	}
	/* Non-const alias of ep, used for the teardown below. */
	target = retep;

	/*
	 * To keep continuity, decrement seq after detached encaptab.
	 *
	 * The lookup loops probe seq = 0, 1, 2, ... and stop at the first
	 * miss, so the hole left by the removed entry has to be closed:
	 * every following entry for the same address pair is deleted and
	 * re-inserted with its seq lowered by one.
	 */
	encap_key_copy(&key, &ep->key);
	encap_key_inc(&key);
	while ((retep = thmap_del(emap, &key, sizeof(key))) != NULL) {
		void *pp;

		encap_key_dec(&retep->key);
		pp = thmap_put(emap, &retep->key, sizeof(retep->key), retep);
		KASSERT(retep == pp);

		encap_key_inc(&key);
	}

	/*
	 * Stage the map's garbage, run pserialize_perform(), and only then
	 * reclaim the garbage and destroy and free the detached entry.
	 * NOTE(review): presumably pserialize_perform() is what waits out
	 * concurrent readers -- confirm against pserialize(9).
	 */
	thgc = thmap_stage_gc(emap);
	pserialize_perform(encaptab.psz);
	thmap_gc(emap, thgc);
	psref_target_destroy(&target->psref, encaptab.elem_class);
	/* sa_len holds the allocation size set in encap_attach_addr(). */
	kmem_free(target->addrpack, target->addrpack->sa_len);
	kmem_free(target, sizeof(*target));

	return 0;
}
860 
861 int
encap_detach(const struct encaptab * cookie)862 encap_detach(const struct encaptab *cookie)
863 {
864           const struct encaptab *ep = cookie;
865           struct encaptab *p;
866           int error;
867 
868           KASSERT(encap_lock_held());
869 
870           if (ep->flag & IP_ENCAP_ADDR_ENABLE)
871                     return encap_detach_addr(ep);
872 
873           PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) {
874                     if (p == ep) {
875                               error = encap_remove(p);
876                               if (error)
877                                         return error;
878                               else
879                                         break;
880                     }
881           }
882           if (p == NULL)
883                     return ENOENT;
884 
885           pserialize_perform(encaptab.psz);
886           psref_target_destroy(&p->psref,
887               encaptab.elem_class);
888           kmem_free(p, sizeof(*p));
889 
890           return 0;
891 }
892 
893 int
encap_lock_enter(void)894 encap_lock_enter(void)
895 {
896           int error;
897 
898           mutex_enter(&encap_whole.lock);
899           while (encap_whole.busy != NULL) {
900                     error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock);
901                     if (error) {
902                               mutex_exit(&encap_whole.lock);
903                               return error;
904                     }
905           }
906           KASSERT(encap_whole.busy == NULL);
907           encap_whole.busy = curlwp;
908           mutex_exit(&encap_whole.lock);
909 
910           return 0;
911 }
912 
/*
 * Release the encap lock taken by encap_lock_enter() and wake up every
 * lwp waiting for it.  Must be called by the lwp that owns the lock.
 */
void
encap_lock_exit(void)
{

	mutex_enter(&encap_whole.lock);
	KASSERT(encap_whole.busy == curlwp);
	encap_whole.busy = NULL;
	cv_broadcast(&encap_whole.cv);
	mutex_exit(&encap_whole.lock);
}
923 
924 bool
encap_lock_held(void)925 encap_lock_held(void)
926 {
927 
928           return (encap_whole.busy == curlwp);
929 }
930