1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (C) 2013-2016 Universita` di Pisa
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29
30 #if defined(__FreeBSD__)
31 #include <sys/cdefs.h> /* prerequisite */
32 __FBSDID("$FreeBSD: stable/12/sys/dev/netmap/netmap_vale.c 372835 2022-12-31 12:40:02Z git2svn $");
33
34 #include <sys/types.h>
35 #include <sys/errno.h>
36 #include <sys/param.h> /* defines used in kernel.h */
37 #include <sys/kernel.h> /* types used in module initialization */
38 #include <sys/conf.h> /* cdevsw struct, UID, GID */
39 #include <sys/sockio.h>
40 #include <sys/socketvar.h> /* struct socket */
41 #include <sys/malloc.h>
42 #include <sys/poll.h>
43 #include <sys/rwlock.h>
44 #include <sys/socket.h> /* sockaddrs */
45 #include <sys/selinfo.h>
46 #include <sys/sysctl.h>
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/bpf.h> /* BIOCIMMEDIATE */
50 #include <machine/bus.h> /* bus_dmamap_* */
51 #include <sys/endian.h>
52 #include <sys/refcount.h>
53 #include <sys/smp.h>
54
55
56 #elif defined(linux)
57
58 #include "bsd_glue.h"
59
60 #elif defined(__APPLE__)
61
62 #warning OSX support is only partial
63 #include "osx_glue.h"
64
65 #elif defined(_WIN32)
66 #include "win_glue.h"
67
68 #else
69
70 #error Unsupported platform
71
72 #endif /* unsupported */
73
74 /*
75 * common headers
76 */
77
78 #include <net/netmap.h>
79 #include <dev/netmap/netmap_kern.h>
80 #include <dev/netmap/netmap_mem2.h>
81 #include <dev/netmap/netmap_bdg.h>
82
83 #ifdef WITH_VALE
84
85 /*
86 * system parameters (most of them in netmap_kern.h)
87 * NM_BDG_NAME prefix for switch port names, default "vale"
88 * NM_BDG_MAXPORTS number of ports
89 * NM_BRIDGES max number of switches in the system.
90 *
91 * Switch ports are named valeX:Y where X is the switch name and Y
92 * is the port. If Y matches a physical interface name, the port is
93 * connected to a physical device.
94 *
95 * Unlike physical interfaces, switch ports use their own memory region
96 * for rings and buffers.
97 * The virtual interfaces use per-queue lock instead of core lock.
98 * In the tx loop, we aggregate traffic in batches to make all operations
99 * faster. The batch size is bridge_batch.
100 */
/* Fixed sizing of the switch data structures. */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
/*
 * Actual size of the tables: the nominal batch plus NETMAP_MAX_FRAGS
 * spare entries.
 */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/* NM_FT_NULL terminates a list of slots in the ft. */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
109
110
111 /*
112 * bridge_batch is set via sysctl to the max batch size to be
113 * used in the bridge. The actual value may be larger as the
114 * last packet in the block may overflow the size.
115 */
116 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
117
118 /* Max number of vale bridges (loader tunable). */
119 unsigned int vale_max_bridges = NM_BRIDGES;
120
121 SYSBEGIN(vars_vale);
122 SYSCTL_DECL(_dev_netmap);
123 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
124 "Max batch size to be used in the bridge");
125 SYSCTL_UINT(_dev_netmap, OID_AUTO, max_bridges, CTLFLAG_RDTUN, &vale_max_bridges, 0,
126 "Max number of vale bridges");
127 SYSEND;
128
/* Forward declarations for callbacks referenced before their definition. */
static int	netmap_vale_vp_create(struct nmreq_header *hdr,
		    struct ifnet *, struct netmap_mem_d *nmd,
		    struct netmap_vp_adapter **);
static int	netmap_vale_vp_bdg_attach(const char *,
		    struct netmap_adapter *, struct nm_bridge *);
static int	netmap_vale_bwrap_attach(const char *,
		    struct netmap_adapter *);
134
/*
 * One nm_vale_q exists per output queue and heads a list of
 * forwarding-table entries destined to it. The list is threaded through
 * ft indexes, with NM_FT_NULL marking an empty list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_vale_q {
	uint16_t	bq_head;	/* ft index of the first entry */
	uint16_t	bq_tail;	/* ft index of the last entry */
	uint32_t	bq_len;		/* number of buffers */
};
145
146 /* Holds the default callbacks */
147 struct netmap_bdg_ops vale_bdg_ops = {
148 .lookup = netmap_vale_learning,
149 .config = NULL,
150 .dtor = NULL,
151 .vp_create = netmap_vale_vp_create,
152 .bwrap_attach = netmap_vale_bwrap_attach,
153 .name = NM_BDG_NAME,
154 };
155
/*
 * Slightly optimized copy routine. Lengths below 1024 are copied in
 * 64-byte chunks, so the amount moved is rounded up to a multiple of
 * 64 bytes; this is often faster than dealing with odd sizes. Lengths
 * of 1024 and above are handed to memcpy() unchanged. Nothing is copied
 * when l <= 0.
 *
 * The caller must guarantee that both buffers have room for the
 * rounded-up length and that they do not overlap.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	const uint64_t *from = _src;
	uint64_t *to = _dst;
	int left, w;

	if (unlikely(l >= 1024)) {
		memcpy(to, from, l);
		return;
	}
	/* eight 64-bit words, i.e. 64 bytes, per round */
	for (left = l; likely(left > 0); left -= 64) {
		for (w = 0; w < 8; w++)
			to[w] = from[w];
		from += 8;
		to += 8;
	}
}
184
185
186 /*
187 * Free the forwarding tables for rings attached to switch ports.
188 */
189 static void
nm_free_bdgfwd(struct netmap_adapter * na)190 nm_free_bdgfwd(struct netmap_adapter *na)
191 {
192 int nrings, i;
193 struct netmap_kring **kring;
194
195 NMG_LOCK_ASSERT();
196 nrings = na->num_tx_rings;
197 kring = na->tx_rings;
198 for (i = 0; i < nrings; i++) {
199 if (kring[i]->nkr_ft) {
200 nm_os_free(kring[i]->nkr_ft);
201 kring[i]->nkr_ft = NULL; /* protect from freeing twice */
202 }
203 }
204 }
205
206
207 /*
208 * Allocate the forwarding tables for the rings attached to the bridge ports.
209 */
210 static int
nm_alloc_bdgfwd(struct netmap_adapter * na)211 nm_alloc_bdgfwd(struct netmap_adapter *na)
212 {
213 int nrings, l, i, num_dstq;
214 struct netmap_kring **kring;
215
216 NMG_LOCK_ASSERT();
217 /* all port:rings + broadcast */
218 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
219 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
220 l += sizeof(struct nm_vale_q) * num_dstq;
221 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
222
223 nrings = netmap_real_rings(na, NR_TX);
224 kring = na->tx_rings;
225 for (i = 0; i < nrings; i++) {
226 struct nm_bdg_fwd *ft;
227 struct nm_vale_q *dstq;
228 int j;
229
230 ft = nm_os_malloc(l);
231 if (!ft) {
232 nm_free_bdgfwd(na);
233 return ENOMEM;
234 }
235 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
236 for (j = 0; j < num_dstq; j++) {
237 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
238 dstq[j].bq_len = 0;
239 }
240 kring[i]->nkr_ft = ft;
241 }
242 return 0;
243 }
244
245 /* Allows external modules to create bridges in exclusive mode,
246 * returns an authentication token that the external module will need
247 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
248 * and nm_bdg_update_private_data() operations.
249 * Successfully executed if ret != NULL and *return_status == 0.
250 */
251 void *
netmap_vale_create(const char * bdg_name,int * return_status)252 netmap_vale_create(const char *bdg_name, int *return_status)
253 {
254 struct nm_bridge *b = NULL;
255 void *ret = NULL;
256
257 NMG_LOCK();
258 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
259 if (b) {
260 *return_status = EEXIST;
261 goto unlock_bdg_create;
262 }
263
264 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
265 if (!b) {
266 *return_status = ENOMEM;
267 goto unlock_bdg_create;
268 }
269
270 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
271 ret = nm_bdg_get_auth_token(b);
272 *return_status = 0;
273
274 unlock_bdg_create:
275 NMG_UNLOCK();
276 return ret;
277 }
278
279 /* Allows external modules to destroy a bridge created through
280 * netmap_bdg_create(), the bridge must be empty.
281 */
282 int
netmap_vale_destroy(const char * bdg_name,void * auth_token)283 netmap_vale_destroy(const char *bdg_name, void *auth_token)
284 {
285 struct nm_bridge *b = NULL;
286 int ret = 0;
287
288 NMG_LOCK();
289 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
290 if (!b) {
291 ret = ENXIO;
292 goto unlock_bdg_free;
293 }
294
295 if (!nm_bdg_valid_auth_token(b, auth_token)) {
296 ret = EACCES;
297 goto unlock_bdg_free;
298 }
299 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
300 ret = EINVAL;
301 goto unlock_bdg_free;
302 }
303
304 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
305 ret = netmap_bdg_free(b);
306 if (ret) {
307 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
308 }
309
310 unlock_bdg_free:
311 NMG_UNLOCK();
312 return ret;
313 }
314
315 /* Process NETMAP_REQ_VALE_LIST. */
316 int
netmap_vale_list(struct nmreq_header * hdr)317 netmap_vale_list(struct nmreq_header *hdr)
318 {
319 struct nmreq_vale_list *req =
320 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
321 int namelen = strlen(hdr->nr_name);
322 struct nm_bridge *b, *bridges;
323 struct netmap_vp_adapter *vpna;
324 int error = 0, i, j;
325 u_int num_bridges;
326
327 netmap_bns_getbridges(&bridges, &num_bridges);
328
329 /* this is used to enumerate bridges and ports */
330 if (namelen) { /* look up indexes of bridge and port */
331 if (strncmp(hdr->nr_name, NM_BDG_NAME,
332 strlen(NM_BDG_NAME))) {
333 return EINVAL;
334 }
335 NMG_LOCK();
336 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
337 if (!b) {
338 NMG_UNLOCK();
339 return ENOENT;
340 }
341
342 req->nr_bridge_idx = b - bridges; /* bridge index */
343 req->nr_port_idx = NM_BDG_NOPORT;
344 for (j = 0; j < b->bdg_active_ports; j++) {
345 i = b->bdg_port_index[j];
346 vpna = b->bdg_ports[i];
347 if (vpna == NULL) {
348 nm_prerr("This should not happen");
349 continue;
350 }
351 /* the former and the latter identify a
352 * virtual port and a NIC, respectively
353 */
354 if (!strcmp(vpna->up.name, hdr->nr_name)) {
355 req->nr_port_idx = i; /* port index */
356 break;
357 }
358 }
359 NMG_UNLOCK();
360 } else {
361 /* return the first non-empty entry starting from
362 * bridge nr_arg1 and port nr_arg2.
363 *
364 * Users can detect the end of the same bridge by
365 * seeing the new and old value of nr_arg1, and can
366 * detect the end of all the bridge by error != 0
367 */
368 i = req->nr_bridge_idx;
369 j = req->nr_port_idx;
370
371 NMG_LOCK();
372 for (error = ENOENT; i < vale_max_bridges; i++) {
373 b = bridges + i;
374 for ( ; j < NM_BDG_MAXPORTS; j++) {
375 if (b->bdg_ports[j] == NULL)
376 continue;
377 vpna = b->bdg_ports[j];
378 /* write back the VALE switch name */
379 strlcpy(hdr->nr_name, vpna->up.name,
380 sizeof(hdr->nr_name));
381 error = 0;
382 goto out;
383 }
384 j = 0; /* following bridges scan from 0 */
385 }
386 out:
387 req->nr_bridge_idx = i;
388 req->nr_port_idx = j;
389 NMG_UNLOCK();
390 }
391
392 return error;
393 }
394
395 /* Process NETMAP_REQ_VALE_ATTACH.
396 */
397 int
netmap_vale_attach(struct nmreq_header * hdr,void * auth_token)398 netmap_vale_attach(struct nmreq_header *hdr, void *auth_token)
399 {
400 struct nmreq_vale_attach *req =
401 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
402 struct netmap_vp_adapter * vpna;
403 struct netmap_adapter *na = NULL;
404 struct netmap_mem_d *nmd = NULL;
405 struct nm_bridge *b = NULL;
406 int error;
407
408 NMG_LOCK();
409 /* permission check for modified bridges */
410 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
411 if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
412 error = EACCES;
413 goto unlock_exit;
414 }
415
416 if (req->reg.nr_mem_id) {
417 nmd = netmap_mem_find(req->reg.nr_mem_id);
418 if (nmd == NULL) {
419 error = EINVAL;
420 goto unlock_exit;
421 }
422 }
423
424 /* check for existing one */
425 error = netmap_get_vale_na(hdr, &na, nmd, 0);
426 if (na) {
427 error = EBUSY;
428 goto unref_exit;
429 }
430 error = netmap_get_vale_na(hdr, &na,
431 nmd, 1 /* create if not exists */);
432 if (error) { /* no device */
433 goto unlock_exit;
434 }
435
436 if (na == NULL) { /* VALE prefix missing */
437 error = EINVAL;
438 goto unlock_exit;
439 }
440
441 if (NETMAP_OWNED_BY_ANY(na)) {
442 error = EBUSY;
443 goto unref_exit;
444 }
445
446 if (na->nm_bdg_ctl) {
447 /* nop for VALE ports. The bwrap needs to put the hwna
448 * in netmap mode (see netmap_bwrap_bdg_ctl)
449 */
450 error = na->nm_bdg_ctl(hdr, na);
451 if (error)
452 goto unref_exit;
453 nm_prdis("registered %s to netmap-mode", na->name);
454 }
455 vpna = (struct netmap_vp_adapter *)na;
456 req->port_index = vpna->bdg_port;
457
458 if (nmd)
459 netmap_mem_put(nmd);
460
461 NMG_UNLOCK();
462 return 0;
463
464 unref_exit:
465 netmap_adapter_put(na);
466 unlock_exit:
467 if (nmd)
468 netmap_mem_put(nmd);
469
470 NMG_UNLOCK();
471 return error;
472 }
473
474 /* Process NETMAP_REQ_VALE_DETACH.
475 */
476 int
netmap_vale_detach(struct nmreq_header * hdr,void * auth_token)477 netmap_vale_detach(struct nmreq_header *hdr, void *auth_token)
478 {
479 struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
480 struct netmap_vp_adapter *vpna;
481 struct netmap_adapter *na;
482 struct nm_bridge *b = NULL;
483 int error;
484
485 NMG_LOCK();
486 /* permission check for modified bridges */
487 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
488 if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
489 error = EACCES;
490 goto unlock_exit;
491 }
492
493 error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
494 if (error) { /* no device, or another bridge or user owns the device */
495 goto unlock_exit;
496 }
497
498 if (na == NULL) { /* VALE prefix missing */
499 error = EINVAL;
500 goto unlock_exit;
501 } else if (nm_is_bwrap(na) &&
502 ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
503 /* Don't detach a NIC with polling */
504 error = EBUSY;
505 goto unref_exit;
506 }
507
508 vpna = (struct netmap_vp_adapter *)na;
509 if (na->na_vp != vpna) {
510 /* trying to detach first attach of VALE persistent port attached
511 * to 2 bridges
512 */
513 error = EBUSY;
514 goto unref_exit;
515 }
516 nmreq_det->port_index = vpna->bdg_port;
517
518 if (na->nm_bdg_ctl) {
519 /* remove the port from bridge. The bwrap
520 * also needs to put the hwna in normal mode
521 */
522 error = na->nm_bdg_ctl(hdr, na);
523 }
524
525 unref_exit:
526 netmap_adapter_put(na);
527 unlock_exit:
528 NMG_UNLOCK();
529 return error;
530
531 }
532
533
534 /* nm_dtor callback for ephemeral VALE ports */
535 static void
netmap_vale_vp_dtor(struct netmap_adapter * na)536 netmap_vale_vp_dtor(struct netmap_adapter *na)
537 {
538 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
539 struct nm_bridge *b = vpna->na_bdg;
540
541 nm_prdis("%s has %d references", na->name, na->na_refcount);
542
543 if (b) {
544 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
545 }
546
547 if (na->ifp != NULL && !nm_iszombie(na)) {
548 NM_DETACH_NA(na->ifp);
549 if (vpna->autodelete) {
550 nm_prdis("releasing %s", na->ifp->if_xname);
551 NMG_UNLOCK();
552 nm_os_vi_detach(na->ifp);
553 NMG_LOCK();
554 }
555 }
556 }
557
558
559
560 /* nm_krings_create callback for VALE ports.
561 * Calls the standard netmap_krings_create, then adds leases on rx
562 * rings and bdgfwd on tx rings.
563 */
564 static int
netmap_vale_vp_krings_create(struct netmap_adapter * na)565 netmap_vale_vp_krings_create(struct netmap_adapter *na)
566 {
567 u_int tailroom;
568 int error, i;
569 uint32_t *leases;
570 u_int nrx = netmap_real_rings(na, NR_RX);
571
572 /*
573 * Leases are attached to RX rings on vale ports
574 */
575 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
576
577 error = netmap_krings_create(na, tailroom);
578 if (error)
579 return error;
580
581 leases = na->tailroom;
582
583 for (i = 0; i < nrx; i++) { /* Receive rings */
584 na->rx_rings[i]->nkr_leases = leases;
585 leases += na->num_rx_desc;
586 }
587
588 error = nm_alloc_bdgfwd(na);
589 if (error) {
590 netmap_krings_delete(na);
591 return error;
592 }
593
594 return 0;
595 }
596
597
/* nm_krings_delete callback for VALE ports.
 * Releases the forwarding tables first, since they hang off the krings,
 * and then the krings themselves.
 */
static void
netmap_vale_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}
605
606
607 static int
608 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
609 struct netmap_vp_adapter *na, u_int ring_nr);
610
611
612 /*
613 * main dispatch routine for the bridge.
614 * Grab packets from a kring, move them into the ft structure
615 * associated to the tx (input) port. Max one instance per port,
616 * filtered on input (ioctl, poll or XXX).
617 * Returns the next position in the ring.
618 */
619 static int
nm_vale_preflush(struct netmap_kring * kring,u_int end)620 nm_vale_preflush(struct netmap_kring *kring, u_int end)
621 {
622 struct netmap_vp_adapter *na =
623 (struct netmap_vp_adapter*)kring->na;
624 struct netmap_ring *ring = kring->ring;
625 struct nm_bdg_fwd *ft;
626 u_int ring_nr = kring->ring_id;
627 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
628 u_int ft_i = 0; /* start from 0 */
629 u_int frags = 1; /* how many frags ? */
630 struct nm_bridge *b = na->na_bdg;
631
632 /* To protect against modifications to the bridge we acquire a
633 * shared lock, waiting if we can sleep (if the source port is
634 * attached to a user process) or with a trylock otherwise (NICs).
635 */
636 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
637 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
638 BDG_RLOCK(b);
639 else if (!BDG_RTRYLOCK(b))
640 return j;
641 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
642 ft = kring->nkr_ft;
643
644 for (; likely(j != end); j = nm_next(j, lim)) {
645 struct netmap_slot *slot = &ring->slot[j];
646 char *buf;
647
648 ft[ft_i].ft_len = slot->len;
649 ft[ft_i].ft_flags = slot->flags;
650 ft[ft_i].ft_offset = 0;
651
652 nm_prdis("flags is 0x%x", slot->flags);
653 /* we do not use the buf changed flag, but we still need to reset it */
654 slot->flags &= ~NS_BUF_CHANGED;
655
656 /* this slot goes into a list so initialize the link field */
657 ft[ft_i].ft_next = NM_FT_NULL;
658 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
659 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
660 if (unlikely(buf == NULL)) {
661 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
662 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
663 kring->name, j, ft[ft_i].ft_len);
664 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
665 ft[ft_i].ft_len = 0;
666 ft[ft_i].ft_flags = 0;
667 }
668 __builtin_prefetch(buf);
669 ++ft_i;
670 if (slot->flags & NS_MOREFRAG) {
671 frags++;
672 continue;
673 }
674 if (unlikely(netmap_verbose && frags > 1))
675 nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
676 ft[ft_i - frags].ft_frags = frags;
677 frags = 1;
678 if (unlikely((int)ft_i >= bridge_batch))
679 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
680 }
681 if (frags > 1) {
682 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
683 * have to fix frags count. */
684 frags--;
685 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
686 ft[ft_i - frags].ft_frags = frags;
687 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
688 }
689 if (ft_i)
690 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
691 BDG_RUNLOCK(b);
692 return j;
693 }
694
695
696 /* ----- FreeBSD if_bridge hash function ------- */
697
698 /*
699 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
700 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
701 *
702 * http://www.burtleburtle.net/bob/hash/spooky.html
703 */
704 #define mix(a, b, c) \
705 do { \
706 a -= b; a -= c; a ^= (c >> 13); \
707 b -= c; b -= a; b ^= (a << 8); \
708 c -= a; c -= b; c ^= (b >> 13); \
709 a -= b; a -= c; a ^= (c >> 12); \
710 b -= c; b -= a; b ^= (a << 16); \
711 c -= a; c -= b; c ^= (b >> 5); \
712 a -= b; a -= c; a ^= (c >> 3); \
713 b -= c; b -= a; b ^= (a << 10); \
714 c -= a; c -= b; c ^= (b >> 15); \
715 } while (/*CONSTCOND*/0)
716
717
718 static __inline uint32_t
nm_vale_rthash(const uint8_t * addr)719 nm_vale_rthash(const uint8_t *addr)
720 {
721 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key
722
723 b += addr[5] << 8;
724 b += addr[4];
725 a += addr[3] << 24;
726 a += addr[2] << 16;
727 a += addr[1] << 8;
728 a += addr[0];
729
730 mix(a, b, c);
731 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
732 return (c & BRIDGE_RTHASH_MASK);
733 }
734
735 #undef mix
736
737
738 /*
739 * Lookup function for a learning bridge.
740 * Update the hash table with the source address,
741 * and then returns the destination port index, and the
742 * ring in *dst_ring (at the moment, always use ring 0)
743 */
744 uint32_t
netmap_vale_learning(struct nm_bdg_fwd * ft,uint8_t * dst_ring,struct netmap_vp_adapter * na,void * private_data)745 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
746 struct netmap_vp_adapter *na, void *private_data)
747 {
748 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
749 u_int buf_len = ft->ft_len - ft->ft_offset;
750 struct nm_hash_ent *ht = private_data;
751 uint32_t sh, dh;
752 u_int dst, mysrc = na->bdg_port;
753 uint64_t smac, dmac;
754 uint8_t indbuf[12];
755
756 if (buf_len < 14) {
757 return NM_BDG_NOPORT;
758 }
759
760 if (ft->ft_flags & NS_INDIRECT) {
761 if (copyin(buf, indbuf, sizeof(indbuf))) {
762 return NM_BDG_NOPORT;
763 }
764 buf = indbuf;
765 }
766
767 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
768 smac = le64toh(*(uint64_t *)(buf + 4));
769 smac >>= 16;
770
771 /*
772 * The hash is somewhat expensive, there might be some
773 * worthwhile optimizations here.
774 */
775 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
776 uint8_t *s = buf+6;
777 sh = nm_vale_rthash(s); /* hash of source */
778 /* update source port forwarding entry */
779 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
780 ht[sh].ports = mysrc;
781 if (netmap_debug & NM_DEBUG_VALE)
782 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
783 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
784 }
785 dst = NM_BDG_BROADCAST;
786 if ((buf[0] & 1) == 0) { /* unicast */
787 dh = nm_vale_rthash(buf); /* hash of dst */
788 if (ht[dh].mac == dmac) { /* found dst */
789 dst = ht[dh].ports;
790 }
791 }
792 return dst;
793 }
794
795
796 /*
797 * Available space in the ring. Only used in VALE code
798 * and only with is_rx = 1
799 */
800 static inline uint32_t
nm_kr_space(struct netmap_kring * k,int is_rx)801 nm_kr_space(struct netmap_kring *k, int is_rx)
802 {
803 int space;
804
805 if (is_rx) {
806 int busy = k->nkr_hwlease - k->nr_hwcur;
807 if (busy < 0)
808 busy += k->nkr_num_slots;
809 space = k->nkr_num_slots - 1 - busy;
810 } else {
811 /* XXX never used in this branch */
812 space = k->nr_hwtail - k->nkr_hwlease;
813 if (space < 0)
814 space += k->nkr_num_slots;
815 }
816 #if 0
817 // sanity check
818 if (k->nkr_hwlease >= k->nkr_num_slots ||
819 k->nr_hwcur >= k->nkr_num_slots ||
820 k->nr_tail >= k->nkr_num_slots ||
821 busy < 0 ||
822 busy >= k->nkr_num_slots) {
823 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
824 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
825 k->nkr_lease_idx, k->nkr_num_slots);
826 }
827 #endif
828 return space;
829 }
830
831
832
833
834 /* make a lease on the kring for N positions. return the
835 * lease index
836 * XXX only used in VALE code and with is_rx = 1
837 */
838 static inline uint32_t
nm_kr_lease(struct netmap_kring * k,u_int n,int is_rx)839 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
840 {
841 uint32_t lim = k->nkr_num_slots - 1;
842 uint32_t lease_idx = k->nkr_lease_idx;
843
844 k->nkr_leases[lease_idx] = NR_NOSLOT;
845 k->nkr_lease_idx = nm_next(lease_idx, lim);
846
847 #ifdef CONFIG_NETMAP_DEBUG
848 if (n > nm_kr_space(k, is_rx)) {
849 nm_prerr("invalid request for %d slots", n);
850 panic("x");
851 }
852 #endif /* CONFIG NETMAP_DEBUG */
853 /* XXX verify that there are n slots */
854 k->nkr_hwlease += n;
855 if (k->nkr_hwlease > lim)
856 k->nkr_hwlease -= lim + 1;
857
858 #ifdef CONFIG_NETMAP_DEBUG
859 if (k->nkr_hwlease >= k->nkr_num_slots ||
860 k->nr_hwcur >= k->nkr_num_slots ||
861 k->nr_hwtail >= k->nkr_num_slots ||
862 k->nkr_lease_idx >= k->nkr_num_slots) {
863 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
864 k->na->name,
865 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
866 k->nkr_lease_idx, k->nkr_num_slots);
867 }
868 #endif /* CONFIG_NETMAP_DEBUG */
869 return lease_idx;
870 }
871
872 /*
873 *
874 * This flush routine supports only unicast and broadcast but a large
875 * number of ports, and lets us replace the learn and dispatch functions.
876 */
877 int
nm_vale_flush(struct nm_bdg_fwd * ft,u_int n,struct netmap_vp_adapter * na,u_int ring_nr)878 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
879 u_int ring_nr)
880 {
881 struct nm_vale_q *dst_ents, *brddst;
882 uint16_t num_dsts = 0, *dsts;
883 struct nm_bridge *b = na->na_bdg;
884 u_int i, me = na->bdg_port;
885
886 /*
887 * The work area (pointed by ft) is followed by an array of
888 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
889 * queues per port plus one for the broadcast traffic.
890 * Then we have an array of destination indexes.
891 */
892 dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
893 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
894
895 /* first pass: find a destination for each packet in the batch */
896 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
897 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
898 uint16_t dst_port, d_i;
899 struct nm_vale_q *d;
900 struct nm_bdg_fwd *start_ft = NULL;
901
902 nm_prdis("slot %d frags %d", i, ft[i].ft_frags);
903
904 if (na->up.virt_hdr_len < ft[i].ft_len) {
905 ft[i].ft_offset = na->up.virt_hdr_len;
906 start_ft = &ft[i];
907 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
908 ft[i].ft_offset = ft[i].ft_len;
909 start_ft = &ft[i+1];
910 } else {
911 /* Drop the packet if the virtio-net header is not into the first
912 * fragment nor at the very beginning of the second.
913 */
914 continue;
915 }
916 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
917 if (netmap_verbose > 255)
918 nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
919 if (dst_port >= NM_BDG_NOPORT)
920 continue; /* this packet is identified to be dropped */
921 else if (dst_port == NM_BDG_BROADCAST)
922 dst_ring = 0; /* broadcasts always go to ring 0 */
923 else if (unlikely(dst_port == me ||
924 !b->bdg_ports[dst_port]))
925 continue;
926
927 /* get a position in the scratch pad */
928 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
929 d = dst_ents + d_i;
930
931 /* append the first fragment to the list */
932 if (d->bq_head == NM_FT_NULL) { /* new destination */
933 d->bq_head = d->bq_tail = i;
934 /* remember this position to be scanned later */
935 if (dst_port != NM_BDG_BROADCAST)
936 dsts[num_dsts++] = d_i;
937 } else {
938 ft[d->bq_tail].ft_next = i;
939 d->bq_tail = i;
940 }
941 d->bq_len += ft[i].ft_frags;
942 }
943
944 /*
945 * Broadcast traffic goes to ring 0 on all destinations.
946 * So we need to add these rings to the list of ports to scan.
947 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
948 * expensive. We should keep a compact list of active destinations
949 * so we could shorten this loop.
950 */
951 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
952 if (brddst->bq_head != NM_FT_NULL) {
953 u_int j;
954 for (j = 0; likely(j < b->bdg_active_ports); j++) {
955 uint16_t d_i;
956 i = b->bdg_port_index[j];
957 if (unlikely(i == me))
958 continue;
959 d_i = i * NM_BDG_MAXRINGS;
960 if (dst_ents[d_i].bq_head == NM_FT_NULL)
961 dsts[num_dsts++] = d_i;
962 }
963 }
964
965 nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
966 /* second pass: scan destinations */
967 for (i = 0; i < num_dsts; i++) {
968 struct netmap_vp_adapter *dst_na;
969 struct netmap_kring *kring;
970 struct netmap_ring *ring;
971 u_int dst_nr, lim, j, d_i, next, brd_next;
972 u_int needed, howmany;
973 int retry = netmap_txsync_retry;
974 struct nm_vale_q *d;
975 uint32_t my_start = 0, lease_idx = 0;
976 int nrings;
977 int virt_hdr_mismatch = 0;
978
979 d_i = dsts[i];
980 nm_prdis("second pass %d port %d", i, d_i);
981 d = dst_ents + d_i;
982 // XXX fix the division
983 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
984 /* protect from the lookup function returning an inactive
985 * destination port
986 */
987 if (unlikely(dst_na == NULL))
988 goto cleanup;
989 if (dst_na->up.na_flags & NAF_SW_ONLY)
990 goto cleanup;
991 /*
992 * The interface may be in !netmap mode in two cases:
993 * - when na is attached but not activated yet;
994 * - when na is being deactivated but is still attached.
995 */
996 if (unlikely(!nm_netmap_on(&dst_na->up))) {
997 nm_prdis("not in netmap mode!");
998 goto cleanup;
999 }
1000
1001 /* there is at least one either unicast or broadcast packet */
1002 brd_next = brddst->bq_head;
1003 next = d->bq_head;
1004 /* we need to reserve this many slots. If fewer are
1005 * available, some packets will be dropped.
1006 * Packets may have multiple fragments, so we may not use
1007 * there is a chance that we may not use all of the slots
1008 * we have claimed, so we will need to handle the leftover
1009 * ones when we regain the lock.
1010 */
1011 needed = d->bq_len + brddst->bq_len;
1012
1013 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1014 if (netmap_verbose) {
1015 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
1016 dst_na->up.virt_hdr_len);
1017 }
1018 /* There is a virtio-net header/offloadings mismatch between
1019 * source and destination. The slower mismatch datapath will
1020 * be used to cope with all the mismatches.
1021 */
1022 virt_hdr_mismatch = 1;
1023 if (dst_na->mfs < na->mfs) {
1024 /* We may need to do segmentation offloadings, and so
1025 * we may need a number of destination slots greater
1026 * than the number of input slots ('needed').
1027 * We look for the smallest integer 'x' which satisfies:
1028 * needed * na->mfs + x * H <= x * na->mfs
1029 * where 'H' is the length of the longest header that may
1030 * be replicated in the segmentation process (e.g. for
1031 * TCPv4 we must account for ethernet header, IP header
1032 * and TCPv4 header).
1033 */
1034 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
1035 needed = (needed * na->mfs) /
1036 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1037 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1038 }
1039 }
1040
1041 nm_prdis(5, "pass 2 dst %d is %x %s",
1042 i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
1043 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1044 nrings = dst_na->up.num_rx_rings;
1045 if (dst_nr >= nrings)
1046 dst_nr = dst_nr % nrings;
1047 kring = dst_na->up.rx_rings[dst_nr];
1048 ring = kring->ring;
1049 /* the destination ring may have not been opened for RX */
1050 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
1051 goto cleanup;
1052 lim = kring->nkr_num_slots - 1;
1053
1054 retry:
1055
1056 if (dst_na->retry && retry) {
1057 /* try to get some free slot from the previous run */
1058 kring->nm_notify(kring, NAF_FORCE_RECLAIM);
1059 /* actually useful only for bwraps, since there
1060 * the notify will trigger a txsync on the hwna. VALE ports
1061 * have dst_na->retry == 0
1062 */
1063 }
1064 /* reserve the buffers in the queue and an entry
1065 * to report completion, and drop lock.
1066 * XXX this might become a helper function.
1067 */
1068 mtx_lock(&kring->q_lock);
1069 if (kring->nkr_stopped) {
1070 mtx_unlock(&kring->q_lock);
1071 goto cleanup;
1072 }
1073 my_start = j = kring->nkr_hwlease;
1074 howmany = nm_kr_space(kring, 1);
1075 if (needed < howmany)
1076 howmany = needed;
1077 lease_idx = nm_kr_lease(kring, howmany, 1);
1078 mtx_unlock(&kring->q_lock);
1079
1080 /* only retry if we need more than available slots */
1081 if (retry && needed <= howmany)
1082 retry = 0;
1083
1084 /* copy to the destination queue */
1085 while (howmany > 0) {
1086 struct netmap_slot *slot;
1087 struct nm_bdg_fwd *ft_p, *ft_end;
1088 u_int cnt;
1089
1090 /* find the queue from which we pick next packet.
1091 * NM_FT_NULL is always higher than valid indexes
1092 * so we never dereference it if the other list
1093 * has packets (and if both are empty we never
1094 * get here).
1095 */
1096 if (next < brd_next) {
1097 ft_p = ft + next;
1098 next = ft_p->ft_next;
1099 } else { /* insert broadcast */
1100 ft_p = ft + brd_next;
1101 brd_next = ft_p->ft_next;
1102 }
1103 cnt = ft_p->ft_frags; // cnt > 0
1104 if (unlikely(cnt > howmany))
1105 break; /* no more space */
1106 if (netmap_verbose && cnt > 1)
1107 nm_prlim(5, "rx %d frags to %d", cnt, j);
1108 ft_end = ft_p + cnt;
1109 if (unlikely(virt_hdr_mismatch)) {
1110 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1111 } else {
1112 howmany -= cnt;
1113 do {
1114 char *dst, *src = ft_p->ft_buf;
1115 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1116
1117 slot = &ring->slot[j];
1118 dst = NMB(&dst_na->up, slot);
1119
1120 nm_prdis("send [%d] %d(%d) bytes at %s:%d",
1121 i, (int)copy_len, (int)dst_len,
1122 dst_na->up.name, j);
1123 /* round to a multiple of 64 */
1124 copy_len = (copy_len + 63) & ~63;
1125
1126 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1127 copy_len > NETMAP_BUF_SIZE(&na->up))) {
1128 nm_prlim(5, "invalid len %d, down to 64", (int)copy_len);
1129 copy_len = dst_len = 64; // XXX
1130 }
1131 if (ft_p->ft_flags & NS_INDIRECT) {
1132 if (copyin(src, dst, copy_len)) {
1133 // invalid user pointer, pretend len is 0
1134 dst_len = 0;
1135 }
1136 } else {
1137 //memcpy(dst, src, copy_len);
1138 pkt_copy(src, dst, (int)copy_len);
1139 }
1140 slot->len = dst_len;
1141 slot->flags = (cnt << 8)| NS_MOREFRAG;
1142 j = nm_next(j, lim);
1143 needed--;
1144 ft_p++;
1145 } while (ft_p != ft_end);
1146 slot->flags = (cnt << 8); /* clear flag on last entry */
1147 }
1148 /* are we done ? */
1149 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1150 break;
1151 }
1152 {
1153 /* current position */
1154 uint32_t *p = kring->nkr_leases; /* shorthand */
1155 uint32_t update_pos;
1156 int still_locked = 1;
1157
1158 mtx_lock(&kring->q_lock);
1159 if (unlikely(howmany > 0)) {
1160 /* not used all bufs. If i am the last one
1161 * i can recover the slots, otherwise must
1162 * fill them with 0 to mark empty packets.
1163 */
1164 nm_prdis("leftover %d bufs", howmany);
1165 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1166 /* yes i am the last one */
1167 nm_prdis("roll back nkr_hwlease to %d", j);
1168 kring->nkr_hwlease = j;
1169 } else {
1170 while (howmany-- > 0) {
1171 ring->slot[j].len = 0;
1172 ring->slot[j].flags = 0;
1173 j = nm_next(j, lim);
1174 }
1175 }
1176 }
1177 p[lease_idx] = j; /* report I am done */
1178
1179 update_pos = kring->nr_hwtail;
1180
1181 if (my_start == update_pos) {
1182 /* all slots before my_start have been reported,
1183 * so scan subsequent leases to see if other ranges
1184 * have been completed, and to a selwakeup or txsync.
1185 */
1186 while (lease_idx != kring->nkr_lease_idx &&
1187 p[lease_idx] != NR_NOSLOT) {
1188 j = p[lease_idx];
1189 p[lease_idx] = NR_NOSLOT;
1190 lease_idx = nm_next(lease_idx, lim);
1191 }
1192 /* j is the new 'write' position. j != my_start
1193 * means there are new buffers to report
1194 */
1195 if (likely(j != my_start)) {
1196 kring->nr_hwtail = j;
1197 still_locked = 0;
1198 mtx_unlock(&kring->q_lock);
1199 kring->nm_notify(kring, 0);
1200 /* this is netmap_notify for VALE ports and
1201 * netmap_bwrap_notify for bwrap. The latter will
1202 * trigger a txsync on the underlying hwna
1203 */
1204 if (dst_na->retry && retry--) {
1205 /* XXX this is going to call nm_notify again.
1206 * Only useful for bwrap in virtual machines
1207 */
1208 goto retry;
1209 }
1210 }
1211 }
1212 if (still_locked)
1213 mtx_unlock(&kring->q_lock);
1214 }
1215 cleanup:
1216 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1217 d->bq_len = 0;
1218 }
1219 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1220 brddst->bq_len = 0;
1221 return 0;
1222 }
1223
1224 /* nm_txsync callback for VALE ports */
1225 static int
netmap_vale_vp_txsync(struct netmap_kring * kring,int flags)1226 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
1227 {
1228 struct netmap_vp_adapter *na =
1229 (struct netmap_vp_adapter *)kring->na;
1230 u_int done;
1231 u_int const lim = kring->nkr_num_slots - 1;
1232 u_int const head = kring->rhead;
1233
1234 if (bridge_batch <= 0) { /* testing only */
1235 done = head; // used all
1236 goto done;
1237 }
1238 if (!na->na_bdg) {
1239 done = head;
1240 goto done;
1241 }
1242 if (bridge_batch > NM_BDG_BATCH)
1243 bridge_batch = NM_BDG_BATCH;
1244
1245 done = nm_vale_preflush(kring, head);
1246 done:
1247 if (done != head)
1248 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1249 /*
1250 * packets between 'done' and 'cur' are left unsent.
1251 */
1252 kring->nr_hwcur = done;
1253 kring->nr_hwtail = nm_prev(done, lim);
1254 if (netmap_debug & NM_DEBUG_TXSYNC)
1255 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1256 return 0;
1257 }
1258
1259
1260 /* create a netmap_vp_adapter that describes a VALE port.
1261 * Only persistent VALE ports have a non-null ifp.
1262 */
1263 static int
netmap_vale_vp_create(struct nmreq_header * hdr,struct ifnet * ifp,struct netmap_mem_d * nmd,struct netmap_vp_adapter ** ret)1264 netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
1265 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
1266 {
1267 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1268 struct netmap_vp_adapter *vpna;
1269 struct netmap_adapter *na;
1270 int error = 0;
1271 u_int npipes = 0;
1272 u_int extrabufs = 0;
1273
1274 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1275 return EINVAL;
1276 }
1277
1278 vpna = nm_os_malloc(sizeof(*vpna));
1279 if (vpna == NULL)
1280 return ENOMEM;
1281
1282 na = &vpna->up;
1283
1284 na->ifp = ifp;
1285 strlcpy(na->name, hdr->nr_name, sizeof(na->name));
1286
1287 /* bound checking */
1288 na->num_tx_rings = req->nr_tx_rings;
1289 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1290 req->nr_tx_rings = na->num_tx_rings; /* write back */
1291 na->num_rx_rings = req->nr_rx_rings;
1292 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1293 req->nr_rx_rings = na->num_rx_rings; /* write back */
1294 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1295 1, NM_BDG_MAXSLOTS, NULL);
1296 na->num_tx_desc = req->nr_tx_slots;
1297 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1298 1, NM_BDG_MAXSLOTS, NULL);
1299 /* validate number of pipes. We want at least 1,
1300 * but probably can do with some more.
1301 * So let's use 2 as default (when 0 is supplied)
1302 */
1303 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1304 /* validate extra bufs */
1305 extrabufs = req->nr_extra_bufs;
1306 nm_bound_var(&extrabufs, 0, 0,
1307 128*NM_BDG_MAXSLOTS, NULL);
1308 req->nr_extra_bufs = extrabufs; /* write back */
1309 na->num_rx_desc = req->nr_rx_slots;
1310 /* Set the mfs to a default value, as it is needed on the VALE
1311 * mismatch datapath. XXX We should set it according to the MTU
1312 * known to the kernel. */
1313 vpna->mfs = NM_BDG_MFS_DEFAULT;
1314 vpna->last_smac = ~0llu;
1315 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
1316 vpna->mfs = netmap_buf_size; */
1317 if (netmap_verbose)
1318 nm_prinf("max frame size %u", vpna->mfs);
1319
1320 na->na_flags |= NAF_BDG_MAYSLEEP;
1321 /* persistent VALE ports look like hw devices
1322 * with a native netmap adapter
1323 */
1324 if (ifp)
1325 na->na_flags |= NAF_NATIVE;
1326 na->nm_txsync = netmap_vale_vp_txsync;
1327 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
1328 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */
1329 na->nm_krings_create = netmap_vale_vp_krings_create;
1330 na->nm_krings_delete = netmap_vale_vp_krings_delete;
1331 na->nm_dtor = netmap_vale_vp_dtor;
1332 nm_prdis("nr_mem_id %d", req->nr_mem_id);
1333 na->nm_mem = nmd ?
1334 netmap_mem_get(nmd):
1335 netmap_mem_private_new(
1336 na->num_tx_rings, na->num_tx_desc,
1337 na->num_rx_rings, na->num_rx_desc,
1338 req->nr_extra_bufs, npipes, &error);
1339 if (na->nm_mem == NULL)
1340 goto err;
1341 na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
1342 /* other nmd fields are set in the common routine */
1343 error = netmap_attach_common(na);
1344 if (error)
1345 goto err;
1346 *ret = vpna;
1347 return 0;
1348
1349 err:
1350 if (na->nm_mem != NULL)
1351 netmap_mem_put(na->nm_mem);
1352 nm_os_free(vpna);
1353 return error;
1354 }
1355
1356 /* nm_bdg_attach callback for VALE ports
1357 * The na_vp port is this same netmap_adapter. There is no host port.
1358 */
1359 static int
netmap_vale_vp_bdg_attach(const char * name,struct netmap_adapter * na,struct nm_bridge * b)1360 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
1361 struct nm_bridge *b)
1362 {
1363 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1364
1365 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
1366 return NM_NEED_BWRAP;
1367 }
1368 na->na_vp = vpna;
1369 strlcpy(na->name, name, sizeof(na->name));
1370 na->na_hostvp = NULL;
1371 return 0;
1372 }
1373
/*
 * nm_krings_create callback for VALE bwrap adapters.
 *
 * Creates the krings through the VALE-port routine first (impersonating
 * a netmap_vp_adapter), then runs the common bwrap step. If the second
 * step fails, the krings created by the first one are deleted again.
 * Returns 0 on success or the error of the failing step.
 */
static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	int rv;

	/* impersonate a netmap_vp_adapter */
	rv = netmap_vale_vp_krings_create(na);
	if (rv == 0) {
		rv = netmap_bwrap_krings_create_common(na);
		if (rv != 0)
			netmap_vale_vp_krings_delete(na);
	}
	return rv;
}
1389
/*
 * nm_krings_delete callback for VALE bwrap adapters.
 *
 * Undoes netmap_vale_bwrap_krings_create() in reverse order: the common
 * bwrap teardown runs first, then the VALE-port krings are deleted.
 */
static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	/* bwrap-specific part first ... */
	netmap_bwrap_krings_delete_common(na);
	/* ... then the krings created on behalf of the VALE port */
	netmap_vale_vp_krings_delete(na);
}
1396
1397 static int
netmap_vale_bwrap_attach(const char * nr_name,struct netmap_adapter * hwna)1398 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
1399 {
1400 struct netmap_bwrap_adapter *bna;
1401 struct netmap_adapter *na = NULL;
1402 struct netmap_adapter *hostna = NULL;
1403 int error;
1404
1405 bna = nm_os_malloc(sizeof(*bna));
1406 if (bna == NULL) {
1407 return ENOMEM;
1408 }
1409 na = &bna->up.up;
1410 strlcpy(na->name, nr_name, sizeof(na->name));
1411 na->nm_register = netmap_bwrap_reg;
1412 na->nm_txsync = netmap_vale_vp_txsync;
1413 // na->nm_rxsync = netmap_bwrap_rxsync;
1414 na->nm_krings_create = netmap_vale_bwrap_krings_create;
1415 na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
1416 na->nm_notify = netmap_bwrap_notify;
1417 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
1418 /* Set the mfs, needed on the VALE mismatch datapath. */
1419 bna->up.mfs = NM_BDG_MFS_DEFAULT;
1420
1421 if (hwna->na_flags & NAF_HOST_RINGS) {
1422 hostna = &bna->host.up;
1423 hostna->nm_notify = netmap_bwrap_notify;
1424 bna->host.mfs = NM_BDG_MFS_DEFAULT;
1425 }
1426
1427 error = netmap_bwrap_attach_common(na, hwna);
1428 if (error) {
1429 nm_os_free(bna);
1430 }
1431 return error;
1432 }
1433
1434 int
netmap_get_vale_na(struct nmreq_header * hdr,struct netmap_adapter ** na,struct netmap_mem_d * nmd,int create)1435 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
1436 struct netmap_mem_d *nmd, int create)
1437 {
1438 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
1439 }
1440
1441
1442 /* creates a persistent VALE port */
1443 int
nm_vi_create(struct nmreq_header * hdr)1444 nm_vi_create(struct nmreq_header *hdr)
1445 {
1446 struct nmreq_vale_newif *req =
1447 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
1448 int error = 0;
1449 /* Build a nmreq_register out of the nmreq_vale_newif,
1450 * so that we can call netmap_get_bdg_na(). */
1451 struct nmreq_register regreq;
1452 bzero(®req, sizeof(regreq));
1453 regreq.nr_tx_slots = req->nr_tx_slots;
1454 regreq.nr_rx_slots = req->nr_rx_slots;
1455 regreq.nr_tx_rings = req->nr_tx_rings;
1456 regreq.nr_rx_rings = req->nr_rx_rings;
1457 regreq.nr_mem_id = req->nr_mem_id;
1458 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
1459 hdr->nr_body = (uintptr_t)®req;
1460 error = netmap_vi_create(hdr, 0 /* no autodelete */);
1461 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
1462 hdr->nr_body = (uintptr_t)req;
1463 /* Write back to the original struct. */
1464 req->nr_tx_slots = regreq.nr_tx_slots;
1465 req->nr_rx_slots = regreq.nr_rx_slots;
1466 req->nr_tx_rings = regreq.nr_tx_rings;
1467 req->nr_rx_rings = regreq.nr_rx_rings;
1468 req->nr_mem_id = regreq.nr_mem_id;
1469 return error;
1470 }
1471
1472 /* remove a persistent VALE port from the system */
1473 int
nm_vi_destroy(const char * name)1474 nm_vi_destroy(const char *name)
1475 {
1476 struct ifnet *ifp;
1477 struct netmap_vp_adapter *vpna;
1478 int error;
1479
1480 ifp = ifunit_ref(name);
1481 if (!ifp)
1482 return ENXIO;
1483 NMG_LOCK();
1484 /* make sure this is actually a VALE port */
1485 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
1486 error = EINVAL;
1487 goto err;
1488 }
1489
1490 vpna = (struct netmap_vp_adapter *)NA(ifp);
1491
1492 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
1493 if (vpna->autodelete) {
1494 error = EINVAL;
1495 goto err;
1496 }
1497
1498 /* also make sure that nobody is using the inferface */
1499 if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
1500 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
1501 error = EBUSY;
1502 goto err;
1503 }
1504
1505 NMG_UNLOCK();
1506
1507 if (netmap_verbose)
1508 nm_prinf("destroying a persistent vale interface %s", ifp->if_xname);
1509 /* Linux requires all the references are released
1510 * before unregister
1511 */
1512 netmap_detach(ifp);
1513 if_rele(ifp);
1514 nm_os_vi_detach(ifp);
1515 return 0;
1516
1517 err:
1518 NMG_UNLOCK();
1519 if_rele(ifp);
1520 return error;
1521 }
1522
1523 static int
nm_update_info(struct nmreq_register * req,struct netmap_adapter * na)1524 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
1525 {
1526 req->nr_rx_rings = na->num_rx_rings;
1527 req->nr_tx_rings = na->num_tx_rings;
1528 req->nr_rx_slots = na->num_rx_desc;
1529 req->nr_tx_slots = na->num_tx_desc;
1530 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
1531 &req->nr_mem_id);
1532 }
1533
1534
1535 /*
1536 * Create a virtual interface registered to the system.
1537 * The interface will be attached to a bridge later.
1538 */
1539 int
netmap_vi_create(struct nmreq_header * hdr,int autodelete)1540 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
1541 {
1542 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1543 struct ifnet *ifp;
1544 struct netmap_vp_adapter *vpna;
1545 struct netmap_mem_d *nmd = NULL;
1546 int error;
1547
1548 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1549 return EINVAL;
1550 }
1551
1552 /* don't include VALE prefix */
1553 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
1554 return EINVAL;
1555 if (strlen(hdr->nr_name) >= IFNAMSIZ) {
1556 return EINVAL;
1557 }
1558 ifp = ifunit_ref(hdr->nr_name);
1559 if (ifp) { /* already exist, cannot create new one */
1560 error = EEXIST;
1561 NMG_LOCK();
1562 if (NM_NA_VALID(ifp)) {
1563 int update_err = nm_update_info(req, NA(ifp));
1564 if (update_err)
1565 error = update_err;
1566 }
1567 NMG_UNLOCK();
1568 if_rele(ifp);
1569 return error;
1570 }
1571 error = nm_os_vi_persist(hdr->nr_name, &ifp);
1572 if (error)
1573 return error;
1574
1575 NMG_LOCK();
1576 if (req->nr_mem_id) {
1577 nmd = netmap_mem_find(req->nr_mem_id);
1578 if (nmd == NULL) {
1579 error = EINVAL;
1580 goto err_1;
1581 }
1582 }
1583 /* netmap_vp_create creates a struct netmap_vp_adapter */
1584 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
1585 if (error) {
1586 if (netmap_debug & NM_DEBUG_VALE)
1587 nm_prerr("error %d", error);
1588 goto err_1;
1589 }
1590 /* persist-specific routines */
1591 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
1592 if (!autodelete) {
1593 netmap_adapter_get(&vpna->up);
1594 } else {
1595 vpna->autodelete = 1;
1596 }
1597 NM_ATTACH_NA(ifp, &vpna->up);
1598 /* return the updated info */
1599 error = nm_update_info(req, &vpna->up);
1600 if (error) {
1601 goto err_2;
1602 }
1603 nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
1604 if (nmd)
1605 netmap_mem_put(nmd);
1606 NMG_UNLOCK();
1607 nm_prdis("created %s", ifp->if_xname);
1608 return 0;
1609
1610 err_2:
1611 netmap_detach(ifp);
1612 err_1:
1613 if (nmd)
1614 netmap_mem_put(nmd);
1615 NMG_UNLOCK();
1616 nm_os_vi_detach(ifp);
1617
1618 return error;
1619 }
1620
1621 #endif /* WITH_VALE */
1622