1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_route.h"
32 #include <sys/types.h>
33 #include <sys/ck.h>
34 #include <sys/epoch.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/rmlock.h>
38 #include <sys/socket.h>
39
40 #include <net/if.h>
41 #include <net/route.h>
42 #include <net/route/nhop.h>
43 #include <net/route/nhop_utils.h>
44
45 #include <net/route/route_ctl.h>
46 #include <net/route/route_var.h>
47 #include <netinet6/scope6_var.h>
48 #include <netlink/netlink.h>
49 #include <netlink/netlink_ctl.h>
50 #include <netlink/netlink_route.h>
51 #include <netlink/route/route_var.h>
52
/*
 * Logging configuration for this file. These definitions must precede the
 * inclusion of <netlink/netlink_debug.h>; the NL_LOG()/IF_DEBUG_LEVEL()
 * calls below depend on them.
 * NOTE(review): presumably DEBUG_MOD_NAME tags the messages,
 * DEBUG_MAX_LEVEL caps the compiled-in verbosity and _DECLARE_DEBUG()
 * sets the default level -- confirm against netlink_debug.h.
 */
#define DEBUG_MOD_NAME nl_nhop
#define DEBUG_MAX_LEVEL LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_DEBUG);
57
58 /*
59 * This file contains the logic to maintain kernel nexthops and
 * nexthop groups based on the data provided by the user.
61 *
62 * Kernel stores (nearly) all of the routing data in the nexthops,
63 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
64 *
65 * Netlink API provides higher-level abstraction for the user. Each
66 * user-created nexthop may map to multiple kernel nexthops.
67 *
68 * The following variations require separate kernel nexthop to be
69 * created:
70 * * prefix flags (NHF_HOST, NHF_DEFAULT)
71 * * using IPv6 gateway for IPv4 routes
72 * * different fibnum
73 *
74 * These kernel nexthops have the lifetime bound to the lifetime of
75 * the user_nhop object. They are not collected until user requests
76 * to delete the created user_nhop.
77 *
78 */
/*
 * A single record of the user nexthop hash table.
 *
 * Records created directly from a user request are "masters": they are
 * keyed with un_fibfam == 0 and carry either un_nhop_src (plain nexthop)
 * or un_nhgrp_src/un_nhgrp_count (nexthop group). Records created by
 * nl_find_nhop() are per-fib/family clones: they carry un_nhop and are
 * chained to their master via un_nextchild.
 */
struct user_nhop {
	uint32_t un_idx; /* Userland-provided index */
	uint32_t un_fibfam; /* fibnum+af(as highest byte), 0 for masters */
	uint8_t un_protocol; /* protocol that installed the record */
	struct nhop_object *un_nhop; /* "production" nexthop (clones only) */
	struct nhop_object *un_nhop_src; /* nexthop to copy from */
	struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
	uint32_t un_nhgrp_count; /* number of nexthops in un_nhgrp_src */
	struct user_nhop *un_next; /* next item in hash chain */
	struct user_nhop *un_nextchild; /* master -> children */
	struct epoch_context un_epoch_ctx; /* epoch ctl helper (deferred free) */
};
91
/*
 * Glue for the CHT_SLIST hash table instantiated below.
 * NOTE(review): the table macros presumably locate these helpers through
 * the "unhop" prefix passed to them -- confirm against nhop_utils.h.
 */
/* produce hash value for an object */
#define unhop_hash_obj(_obj) (hash_unhop(_obj))
/* compare two objects; cmp_unhop() returns non-zero when the keys match */
#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
/* next object accessor */
#define unhop_next(_obj) (_obj)->un_next

/* Instantiate the hash table for struct user_nhop under the "unhop" prefix */
CHT_SLIST_DEFINE(unhop, struct user_nhop);
100
101 struct unhop_ctl {
102 struct unhop_head un_head;
103 struct rmlock un_lock;
104 };
105 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl")
106 #define UN_TRACKER struct rm_priotracker un_tracker
107 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker)
108 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker)
109
110 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock);
111 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock);
112
/*
 * Per-VNET pointer to the control block. It stays NULL until
 * vnet_init_unhops() publishes a control block (called from the
 * RTM_NEWNEXTHOP handler) and is reset by vnet_destroy_unhops().
 */
VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
#define V_un_ctl VNET(un_ctl)

/* Forward declarations for helpers used before their definition */
static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
static unsigned int hash_unhop(const struct user_nhop *obj);

static void destroy_unhop(struct user_nhop *unhop);
static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
    uint32_t fibnum, int family, int nh_flags);
123
124 static int
cmp_unhop(const struct user_nhop * a,const struct user_nhop * b)125 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
126 {
127 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
128 }
129
130 /*
131 * Hash callback: calculate hash of an object
132 */
133 static unsigned int
hash_unhop(const struct user_nhop * obj)134 hash_unhop(const struct user_nhop *obj)
135 {
136 return (obj->un_idx ^ obj->un_fibfam);
137 }
138
/*
 * True for user-created template ("master") records, which are keyed with
 * un_fibfam == 0; clones made by nl_find_nhop() carry a non-zero family byte.
 */
#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
140
141 /*
142 * Factory interface for creating matching kernel nexthops/nexthop groups
143 *
144 * @uidx: userland nexhop index used to create the nexthop
145 * @fibnum: fibnum nexthop will be used in
146 * @family: upper family nexthop will be used in
147 * @nh_flags: desired nexthop prefix flags
148 * @perror: pointer to store error to
149 *
150 * Returns referenced nexthop linked to @fibnum/@family rib on success.
151 */
152 struct nhop_object *
nl_find_nhop(uint32_t fibnum,int family,uint32_t uidx,int nh_flags,int * perror)153 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
154 int nh_flags, int *perror)
155 {
156 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
157 UN_TRACKER;
158
159 if (__predict_false(ctl == NULL))
160 return (NULL);
161
162 struct user_nhop key= {
163 .un_idx = uidx,
164 .un_fibfam = fibnum | ((uint32_t)family) << 24,
165 };
166 struct user_nhop *unhop;
167
168 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
169
170 if (__predict_false(family == 0))
171 return (NULL);
172
173 UN_RLOCK(ctl);
174 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
175 if (unhop != NULL) {
176 struct nhop_object *nh = unhop->un_nhop;
177 UN_RLOCK(ctl);
178 *perror = 0;
179 nhop_ref_any(nh);
180 return (nh);
181 }
182
183 /*
184 * Exact nexthop not found. Search for template nexthop to clone from.
185 */
186 key.un_fibfam = 0;
187 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
188 if (unhop == NULL) {
189 UN_RUNLOCK(ctl);
190 *perror = ESRCH;
191 return (NULL);
192 }
193
194 UN_RUNLOCK(ctl);
195
196 /* Create entry to insert first */
197 struct user_nhop *un_new, *un_tmp;
198 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
199 if (un_new == NULL) {
200 *perror = ENOMEM;
201 return (NULL);
202 }
203 un_new->un_idx = uidx;
204 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
205
206 /* Relying on epoch to protect unhop here */
207 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
208 if (un_new->un_nhop == NULL) {
209 free(un_new, M_NETLINK);
210 *perror = ENOMEM;
211 return (NULL);
212 }
213
214 /* Insert back and report */
215 UN_WLOCK(ctl);
216
217 /* First, find template record once again */
218 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
219 if (unhop == NULL) {
220 /* Someone deleted the nexthop during the call */
221 UN_WUNLOCK(ctl);
222 *perror = ESRCH;
223 destroy_unhop(un_new);
224 return (NULL);
225 }
226
227 /* Second, check the direct match */
228 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
229 struct nhop_object *nh;
230 if (un_tmp != NULL) {
231 /* Another thread already created the desired nextop, use it */
232 nh = un_tmp->un_nhop;
233 } else {
234 /* Finally, insert the new nexthop and link it to the primary */
235 nh = un_new->un_nhop;
236 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
237 un_new->un_nextchild = unhop->un_nextchild;
238 unhop->un_nextchild = un_new;
239 un_new = NULL;
240 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
241 }
242
243 UN_WUNLOCK(ctl);
244
245 if (un_new != NULL)
246 destroy_unhop(un_new);
247
248 *perror = 0;
249 nhop_ref_any(nh);
250 return (nh);
251 }
252
253 static struct user_nhop *
nl_find_base_unhop(struct unhop_ctl * ctl,uint32_t uidx)254 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
255 {
256 struct user_nhop key= { .un_idx = uidx };
257 struct user_nhop *unhop = NULL;
258 UN_TRACKER;
259
260 UN_RLOCK(ctl);
261 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
262 UN_RUNLOCK(ctl);
263
264 return (unhop);
265 }
266
267 #define MAX_STACK_NHOPS 4
268 static struct nhop_object *
clone_unhop(const struct user_nhop * unhop,uint32_t fibnum,int family,int nh_flags)269 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
270 {
271 #ifdef ROUTE_MPATH
272 const struct weightened_nhop *wn;
273 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
274 uint32_t num_nhops;
275 #endif
276 struct nhop_object *nh = NULL;
277 int error;
278
279 if (unhop->un_nhop_src != NULL) {
280 IF_DEBUG_LEVEL(LOG_DEBUG2) {
281 char nhbuf[NHOP_PRINT_BUFSIZE];
282 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
283 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
284 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
285 family, nh_flags);
286 }
287 struct nhop_object *nh;
288 nh = nhop_alloc(fibnum, AF_UNSPEC);
289 if (nh == NULL)
290 return (NULL);
291 nhop_copy(nh, unhop->un_nhop_src);
292 /* Check that nexthop gateway is compatible with the new family */
293 if (!nhop_set_upper_family(nh, family)) {
294 nhop_free(nh);
295 return (NULL);
296 }
297 nhop_set_uidx(nh, unhop->un_idx);
298 nhop_set_pxtype_flag(nh, nh_flags);
299 return (nhop_get_nhop(nh, &error));
300 }
301 #ifdef ROUTE_MPATH
302 wn = unhop->un_nhgrp_src;
303 num_nhops = unhop->un_nhgrp_count;
304
305 if (num_nhops > MAX_STACK_NHOPS) {
306 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
307 if (wn_new == NULL)
308 return (NULL);
309 } else
310 wn_new = wn_base;
311
312 for (int i = 0; i < num_nhops; i++) {
313 uint32_t uidx = nhop_get_uidx(wn[i].nh);
314 MPASS(uidx != 0);
315 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
316 if (error != 0)
317 break;
318 wn_new[i].weight = wn[i].weight;
319 }
320
321 if (error == 0) {
322 struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
323 struct nhgrp_object *nhg;
324
325 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
326 nh = (struct nhop_object *)nhg;
327 }
328
329 if (wn_new != wn_base)
330 free(wn_new, M_TEMP);
331 #endif
332 return (nh);
333 }
334
335 static void
destroy_unhop(struct user_nhop * unhop)336 destroy_unhop(struct user_nhop *unhop)
337 {
338 if (unhop->un_nhop != NULL)
339 nhop_free_any(unhop->un_nhop);
340 if (unhop->un_nhop_src != NULL)
341 nhop_free_any(unhop->un_nhop_src);
342 free(unhop, M_NETLINK);
343 }
344
345 static void
destroy_unhop_epoch(epoch_context_t ctx)346 destroy_unhop_epoch(epoch_context_t ctx)
347 {
348 struct user_nhop *unhop;
349
350 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
351
352 destroy_unhop(unhop);
353 }
354
355 static uint32_t
find_spare_uidx(struct unhop_ctl * ctl)356 find_spare_uidx(struct unhop_ctl *ctl)
357 {
358 struct user_nhop *unhop, key = {};
359 uint32_t uidx = 0;
360 UN_TRACKER;
361
362 UN_RLOCK(ctl);
363 /* This should return spare uid with 75% of 65k used in ~99/100 cases */
364 for (int i = 0; i < 16; i++) {
365 key.un_idx = (arc4random() % 65536) + 65536 * 4;
366 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
367 if (unhop == NULL) {
368 uidx = key.un_idx;
369 break;
370 }
371 }
372 UN_RUNLOCK(ctl);
373
374 return (uidx);
375 }
376
377
/*
 * Actual netlink code
 */

/*
 * Bundle of state used when emitting nexthop messages. The handlers in
 * this file only fill in and use nw, hdr and error; the remaining fields
 * are not referenced by the code visible here.
 */
struct netlink_walkargs {
	struct nl_writer *nw; /* writer the messages are emitted to */
	struct nlmsghdr hdr; /* header template (pid/seq/flags/type) */
	struct nlpcb *so;
	int family;
	int error; /* passed to nlmsg_end_dump() */
	int count;
	int dumped;
};
/* Jumps to the enclosing function's "enomem" label when @_v is NULL */
#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
391
392 static bool
dump_nhgrp(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)393 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
394 struct nl_writer *nw)
395 {
396
397 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
398 goto enomem;
399
400 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
401 nhm->nh_family = AF_UNSPEC;
402 nhm->nh_scope = 0;
403 nhm->nh_protocol = unhop->un_protocol;
404 nhm->nh_flags = 0;
405
406 nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
407 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
408
409 struct weightened_nhop *wn = unhop->un_nhgrp_src;
410 uint32_t num_nhops = unhop->un_nhgrp_count;
411 /* TODO: a better API? */
412 int nla_len = sizeof(struct nlattr);
413 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
414 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
415 if (nla == NULL)
416 goto enomem;
417 nla->nla_type = NHA_GROUP;
418 nla->nla_len = nla_len;
419 for (int i = 0; i < num_nhops; i++) {
420 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
421 grp->id = nhop_get_uidx(wn[i].nh);
422 grp->weight = wn[i].weight;
423 grp->resvd1 = 0;
424 grp->resvd2 = 0;
425 }
426
427 if (nlmsg_end(nw))
428 return (true);
429 enomem:
430 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
431 nlmsg_abort(nw);
432 return (false);
433 }
434
435 static bool
dump_nhop(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)436 dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
437 struct nl_writer *nw)
438 {
439 struct nhop_object *nh = unhop->un_nhop_src;
440
441 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
442 goto enomem;
443
444 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
445 ENOMEM_IF_NULL(nhm);
446 nhm->nh_family = nhop_get_neigh_family(nh);
447 nhm->nh_scope = 0; // XXX: what's that?
448 nhm->nh_protocol = unhop->un_protocol;
449 nhm->nh_flags = 0;
450
451 nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
452 if (nh->nh_flags & NHF_BLACKHOLE) {
453 nlattr_add_flag(nw, NHA_BLACKHOLE);
454 goto done;
455 }
456 nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index);
457
458 switch (nh->gw_sa.sa_family) {
459 #ifdef INET
460 case AF_INET:
461 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
462 break;
463 #endif
464 #ifdef INET6
465 case AF_INET6:
466 {
467 struct in6_addr addr = nh->gw6_sa.sin6_addr;
468 in6_clearscope(&addr);
469 nlattr_add(nw, NHA_GATEWAY, 16, &addr);
470 break;
471 }
472 #endif
473 }
474
475 done:
476 if (nlmsg_end(nw))
477 return (true);
478 enomem:
479 nlmsg_abort(nw);
480 return (false);
481 }
482
483 static void
dump_unhop(const struct user_nhop * unhop,struct nlmsghdr * hdr,struct nl_writer * nw)484 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
485 struct nl_writer *nw)
486 {
487 if (unhop->un_nhop_src != NULL)
488 dump_nhop(unhop, hdr, nw);
489 else
490 dump_nhgrp(unhop, hdr, nw);
491 }
492
493 static int
delete_unhop(struct unhop_ctl * ctl,struct nlmsghdr * hdr,uint32_t uidx)494 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
495 {
496 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
497
498 struct user_nhop key = { .un_idx = uidx };
499
500 UN_WLOCK(ctl);
501
502 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
503
504 if (unhop_base != NULL) {
505 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
506 IF_DEBUG_LEVEL(LOG_DEBUG2) {
507 char nhbuf[NHOP_PRINT_BUFSIZE];
508 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
509 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
510 "removed base nhop %u: %s", uidx, nhbuf);
511 }
512 /* Unlink all child nexhops as well, keeping the chain intact */
513 unhop_chain = unhop_base->un_nextchild;
514 while (unhop_chain != NULL) {
515 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
516 unhop_ret);
517 MPASS(unhop_chain == unhop_ret);
518 IF_DEBUG_LEVEL(LOG_DEBUG3) {
519 char nhbuf[NHOP_PRINT_BUFSIZE];
520 nhop_print_buf_any(unhop_chain->un_nhop,
521 nhbuf, sizeof(nhbuf));
522 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
523 "removed child nhop %u: %s", uidx, nhbuf);
524 }
525 unhop_chain = unhop_chain->un_nextchild;
526 }
527 }
528
529 UN_WUNLOCK(ctl);
530
531 if (unhop_base == NULL) {
532 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
533 return (ENOENT);
534 }
535
536 /* Report nexthop deletion */
537 struct netlink_walkargs wa = {
538 .hdr.nlmsg_pid = hdr->nlmsg_pid,
539 .hdr.nlmsg_seq = hdr->nlmsg_seq,
540 .hdr.nlmsg_flags = hdr->nlmsg_flags,
541 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
542 };
543
544 struct nl_writer nw = {};
545 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
546 NL_LOG(LOG_DEBUG, "error allocating message writer");
547 return (ENOMEM);
548 }
549
550 dump_unhop(unhop_base, &wa.hdr, &nw);
551 nlmsg_flush(&nw);
552
553 while (unhop_base != NULL) {
554 unhop_chain = unhop_base->un_nextchild;
555 NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
556 unhop_base = unhop_chain;
557 }
558
559 return (0);
560 }
561
562 static void
consider_resize(struct unhop_ctl * ctl,uint32_t new_size)563 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
564 {
565 void *new_ptr = NULL;
566 size_t alloc_size;
567
568 if (new_size == 0)
569 return;
570
571 if (new_size != 0) {
572 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
573 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
574 if (new_ptr == NULL)
575 return;
576 }
577
578 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
579 UN_WLOCK(ctl);
580 if (new_ptr != NULL) {
581 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
582 }
583 UN_WUNLOCK(ctl);
584
585
586 if (new_ptr != NULL)
587 free(new_ptr, M_NETLINK);
588 }
589
590 static bool __noinline
vnet_init_unhops(void)591 vnet_init_unhops(void)
592 {
593 uint32_t num_buckets = 16;
594 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
595
596 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
597 M_NOWAIT | M_ZERO);
598 if (ctl == NULL)
599 return (false);
600
601 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
602 if (ptr == NULL) {
603 free(ctl, M_NETLINK);
604 return (false);
605 }
606 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
607 UN_LOCK_INIT(ctl);
608
609 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
610 free(ptr, M_NETLINK);
611 free(ctl, M_NETLINK);
612 }
613
614 if (atomic_load_ptr(&V_un_ctl) == NULL)
615 return (false);
616
617 NL_LOG(LOG_NOTICE, "UNHOPS init done");
618
619 return (true);
620 }
621
622 static void
vnet_destroy_unhops(const void * unused __unused)623 vnet_destroy_unhops(const void *unused __unused)
624 {
625 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
626 struct user_nhop *unhop, *tmp;
627
628 if (ctl == NULL)
629 return;
630 V_un_ctl = NULL;
631
632 /* Wait till all unhop users finish their reads */
633 NET_EPOCH_WAIT();
634
635 UN_WLOCK(ctl);
636 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
637 destroy_unhop(unhop);
638 } CHT_SLIST_FOREACH_SAFE_END;
639 UN_WUNLOCK(ctl);
640
641 free(ctl->un_head.ptr, M_NETLINK);
642 free(ctl, M_NETLINK);
643 }
644 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
645 vnet_destroy_unhops, NULL);
646
647 static int
nlattr_get_nhg(struct nlattr * nla,struct nl_pstate * npt,const void * arg,void * target)648 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
649 {
650 int error = 0;
651
652 /* Verify attribute correctness */
653 struct nexthop_grp *grp = NLA_DATA(nla);
654 int data_len = NLA_DATA_LEN(nla);
655
656 int count = data_len / sizeof(*grp);
657 if (count == 0 || (count * sizeof(*grp) != data_len)) {
658 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
659 return (EINVAL);
660 }
661
662 *((struct nlattr **)target) = nla;
663 return (error);
664 }
665
/*
 * Result of parsing an RTM_*NEXTHOP request: the struct nhmsg header
 * fields and the NHA_* attributes extracted by nhmsg_parser.
 */
struct nl_parsed_nhop {
	uint32_t nha_id; /* NHA_ID: user nexthop index, 0 if absent */
	uint8_t nha_blackhole; /* NHA_BLACKHOLE flag */
	uint8_t nha_groups; /* NHA_GROUPS flag */
	struct ifnet *nha_oif; /* NHA_OIF: interface */
	struct sockaddr *nha_gw; /* NHA_GATEWAY: gateway address */
	struct nlattr *nha_group; /* NHA_GROUP: raw attribute, see nlattr_get_nhg() */
	uint8_t nh_family; /* nhmsg nh_family */
	uint8_t nh_protocol; /* nhmsg nh_protocol */
};
676
/*
 * Parser description for nexthop messages: maps struct nhmsg header
 * fields (_IN offsets) and NHA_* attributes onto struct nl_parsed_nhop
 * (_OUT offsets).
 */
#define _IN(_field) offsetof(struct nhmsg, _field)
#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field)
/* Fixed header fields */
static const struct nlfield_parser nlf_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};

/* Attributes */
static const struct nlattr_parser nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
};
#undef _IN
#undef _OUT
NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh);
695
696 static bool
eligible_nhg(const struct nhop_object * nh)697 eligible_nhg(const struct nhop_object *nh)
698 {
699 return (nh->nh_flags & NHF_GATEWAY);
700 }
701
702 static int
newnhg(struct unhop_ctl * ctl,struct nl_parsed_nhop * attrs,struct user_nhop * unhop)703 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
704 {
705 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
706 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
707 struct weightened_nhop *wn;
708
709 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
710 if (wn == NULL)
711 return (ENOMEM);
712
713 for (int i = 0; i < count; i++) {
714 struct user_nhop *unhop;
715 unhop = nl_find_base_unhop(ctl, grp[i].id);
716 if (unhop == NULL) {
717 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
718 free(wn, M_NETLINK);
719 return (ESRCH);
720 } else if (unhop->un_nhop_src == NULL) {
721 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
722 grp[i].id);
723 free(wn, M_NETLINK);
724 return (ENOTSUP);
725 } else if (!eligible_nhg(unhop->un_nhop_src)) {
726 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
727 grp[i].id);
728 free(wn, M_NETLINK);
729 return (ENOTSUP);
730 }
731 /*
732 * TODO: consider more rigid eligibility checks:
733 * restrict nexthops with the same gateway
734 */
735 wn[i].nh = unhop->un_nhop_src;
736 wn[i].weight = grp[i].weight;
737 }
738 unhop->un_nhgrp_src = wn;
739 unhop->un_nhgrp_count = count;
740 return (0);
741 }
742
743 /*
744 * Sets nexthop @nh gateway specified by @gw.
745 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to
746 * @ifp ifindex.
747 * Returns 0 on success or errno.
748 */
749 int
nl_set_nexthop_gw(struct nhop_object * nh,struct sockaddr * gw,struct ifnet * ifp,struct nl_pstate * npt)750 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp,
751 struct nl_pstate *npt)
752 {
753 #ifdef INET6
754 if (gw->sa_family == AF_INET6) {
755 struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
756 if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
757 if (ifp == NULL) {
758 NLMSG_REPORT_ERR_MSG(npt, "interface not set");
759 return (EINVAL);
760 }
761 in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index);
762 }
763 }
764 #endif
765 nhop_set_gw(nh, gw, true);
766 return (0);
767 }
768
769 static int
newnhop(struct nl_parsed_nhop * attrs,struct user_nhop * unhop,struct nl_pstate * npt)770 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
771 {
772 struct ifaddr *ifa = NULL;
773 struct nhop_object *nh;
774 int error;
775
776 if (!attrs->nha_blackhole) {
777 if (attrs->nha_gw == NULL) {
778 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
779 return (EINVAL);
780 }
781 if (attrs->nha_oif == NULL) {
782 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
783 return (EINVAL);
784 }
785 if (ifa == NULL)
786 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
787 if (ifa == NULL) {
788 NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
789 return (EINVAL);
790 }
791 }
792
793 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
794
795 nh = nhop_alloc(RT_DEFAULT_FIB, family);
796 if (nh == NULL) {
797 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
798 return (ENOMEM);
799 }
800 nhop_set_uidx(nh, attrs->nha_id);
801
802 if (attrs->nha_blackhole)
803 nhop_set_blackhole(nh, NHF_BLACKHOLE);
804 else {
805 error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
806 if (error != 0) {
807 nhop_free(nh);
808 return (error);
809 }
810 nhop_set_transmit_ifp(nh, attrs->nha_oif);
811 nhop_set_src(nh, ifa);
812 }
813
814 error = nhop_get_unlinked(nh);
815 if (error != 0) {
816 NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
817 return (error);
818 }
819
820 IF_DEBUG_LEVEL(LOG_DEBUG2) {
821 char nhbuf[NHOP_PRINT_BUFSIZE];
822 nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
823 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
824 }
825
826 unhop->un_nhop_src = nh;
827 return (0);
828 }
829
830 static int
rtnl_handle_newnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)831 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
832 struct nl_pstate *npt)
833 {
834 struct user_nhop *unhop;
835 int error;
836
837 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
838 return (ENOMEM);
839 struct unhop_ctl *ctl = V_un_ctl;
840
841 struct nl_parsed_nhop attrs = {};
842 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
843 if (error != 0)
844 return (error);
845
846 /*
847 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
848 * citizen.
849 */
850 if (attrs.nha_id == 0) {
851 attrs.nha_id = find_spare_uidx(ctl);
852 if (attrs.nha_id == 0) {
853 NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
854 return (ENOSPC);
855 }
856 }
857
858 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);
859
860 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
861 if (unhop == NULL) {
862 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
863 return (ENOMEM);
864 }
865 unhop->un_idx = attrs.nha_id;
866 unhop->un_protocol = attrs.nh_protocol;
867
868 if (attrs.nha_group)
869 error = newnhg(ctl, &attrs, unhop);
870 else
871 error = newnhop(&attrs, unhop, npt);
872
873 if (error != 0) {
874 free(unhop, M_NETLINK);
875 return (error);
876 }
877
878 UN_WLOCK(ctl);
879 /* Check if uidx already exists */
880 struct user_nhop *tmp = NULL;
881 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
882 if (tmp != NULL) {
883 UN_WUNLOCK(ctl);
884 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
885 destroy_unhop(unhop);
886 return (EEXIST);
887 }
888 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
889 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
890 UN_WUNLOCK(ctl);
891
892 /* Report addition of the next nexhop */
893 struct netlink_walkargs wa = {
894 .hdr.nlmsg_pid = hdr->nlmsg_pid,
895 .hdr.nlmsg_seq = hdr->nlmsg_seq,
896 .hdr.nlmsg_flags = hdr->nlmsg_flags,
897 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
898 };
899
900 struct nl_writer nw = {};
901 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
902 NL_LOG(LOG_DEBUG, "error allocating message writer");
903 return (ENOMEM);
904 }
905
906 dump_unhop(unhop, &wa.hdr, &nw);
907 nlmsg_flush(&nw);
908
909 consider_resize(ctl, num_buckets_new);
910
911 return (0);
912 }
913
914 static int
rtnl_handle_delnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)915 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
916 struct nl_pstate *npt)
917 {
918 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
919 int error;
920
921 if (__predict_false(ctl == NULL))
922 return (ESRCH);
923
924 struct nl_parsed_nhop attrs = {};
925 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
926 if (error != 0)
927 return (error);
928
929 if (attrs.nha_id == 0) {
930 NL_LOG(LOG_DEBUG, "NHA_ID not set");
931 return (EINVAL);
932 }
933
934 error = delete_unhop(ctl, hdr, attrs.nha_id);
935
936 return (error);
937 }
938
939 static bool
match_unhop(const struct nl_parsed_nhop * attrs,struct user_nhop * unhop)940 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
941 {
942 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
943 return (false);
944 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
945 return (false);
946 if (attrs->nha_oif != NULL &&
947 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
948 return (false);
949
950 return (true);
951 }
952
953 static int
rtnl_handle_getnhop(struct nlmsghdr * hdr,struct nlpcb * nlp,struct nl_pstate * npt)954 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
955 struct nl_pstate *npt)
956 {
957 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
958 struct user_nhop *unhop;
959 UN_TRACKER;
960 int error;
961
962 if (__predict_false(ctl == NULL))
963 return (ESRCH);
964
965 struct nl_parsed_nhop attrs = {};
966 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
967 if (error != 0)
968 return (error);
969
970 struct netlink_walkargs wa = {
971 .nw = npt->nw,
972 .hdr.nlmsg_pid = hdr->nlmsg_pid,
973 .hdr.nlmsg_seq = hdr->nlmsg_seq,
974 .hdr.nlmsg_flags = hdr->nlmsg_flags,
975 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
976 };
977
978 if (attrs.nha_id != 0) {
979 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
980 struct user_nhop key= { .un_idx = attrs.nha_id };
981 UN_RLOCK(ctl);
982 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
983 UN_RUNLOCK(ctl);
984
985 if (unhop == NULL)
986 return (ESRCH);
987 dump_unhop(unhop, &wa.hdr, wa.nw);
988 return (0);
989 }
990
991 UN_RLOCK(ctl);
992 wa.hdr.nlmsg_flags |= NLM_F_MULTI;
993 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
994 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
995 dump_unhop(unhop, &wa.hdr, wa.nw);
996 } CHT_SLIST_FOREACH_END;
997 UN_RUNLOCK(ctl);
998
999 if (wa.error == 0) {
1000 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
1001 return (ENOMEM);
1002 }
1003 return (0);
1004 }
1005
/*
 * Message handlers registered by rtnl_nexthops_init().
 * The NEW/DEL entries set .priv to PRIV_NET_ROUTE; the GET entry leaves
 * .priv unset.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_NEWNEXTHOP,
		.name = "RTM_NEWNEXTHOP",
		.cb = &rtnl_handle_newnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_DELNEXTHOP,
		.name = "RTM_DELNEXTHOP",
		.cb = &rtnl_handle_delnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_GETNEXTHOP,
		.name = "RTM_GETNEXTHOP",
		.cb = &rtnl_handle_getnhop,
	}
};

/* Parsers handed to NL_VERIFY_PARSERS() at initialization time */
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser };
1027
1028 void
rtnl_nexthops_init(void)1029 rtnl_nexthops_init(void)
1030 {
1031 NL_VERIFY_PARSERS(all_parsers);
1032 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1033 }
1034