1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2013-2016 Universita` di Pisa
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29
30 #if defined(__FreeBSD__)
31 #include <sys/cdefs.h> /* prerequisite */
32 #include <sys/types.h>
33 #include <sys/errno.h>
34 #include <sys/param.h> /* defines used in kernel.h */
35 #include <sys/kernel.h> /* types used in module initialization */
36 #include <sys/conf.h> /* cdevsw struct, UID, GID */
37 #include <sys/sockio.h>
38 #include <sys/socketvar.h> /* struct socket */
39 #include <sys/malloc.h>
40 #include <sys/poll.h>
41 #include <sys/rwlock.h>
42 #include <sys/socket.h> /* sockaddrs */
43 #include <sys/selinfo.h>
44 #include <sys/sysctl.h>
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/bpf.h> /* BIOCIMMEDIATE */
48 #include <machine/bus.h> /* bus_dmamap_* */
49 #include <sys/endian.h>
50 #include <sys/refcount.h>
51 #include <sys/smp.h>
52
53
54 #elif defined(linux)
55
56 #include "bsd_glue.h"
57
58 #elif defined(__APPLE__)
59
60 #warning OSX support is only partial
61 #include "osx_glue.h"
62
63 #elif defined(_WIN32)
64 #include "win_glue.h"
65
66 #else
67
68 #error Unsupported platform
69
70 #endif /* unsupported */
71
72 /*
73 * common headers
74 */
75
76 #include <net/netmap.h>
77 #include <dev/netmap/netmap_kern.h>
78 #include <dev/netmap/netmap_mem2.h>
79 #include <dev/netmap/netmap_bdg.h>
80
81 #ifdef WITH_VALE
82
83 /*
84 * system parameters (most of them in netmap_kern.h)
85 * NM_BDG_NAME prefix for switch port names, default "vale"
86 * NM_BDG_MAXPORTS number of ports
87 * NM_BRIDGES max number of switches in the system.
88 *
89 * Switch ports are named valeX:Y where X is the switch name and Y
90 * is the port. If Y matches a physical interface name, the port is
91 * connected to a physical device.
92 *
93 * Unlike physical interfaces, switch ports use their own memory region
94 * for rings and buffers.
95 * The virtual interfaces use per-queue lock instead of core lock.
96 * In the tx loop, we aggregate traffic in batches to make all operations
97 * faster. The batch size is bridge_batch.
98 */
/*
 * Destination queues are indexed as port * NM_BDG_MAXRINGS + ring and the
 * ring is later recovered with "& (NM_BDG_MAXRINGS-1)", so this value
 * must remain a power of two.
 */
#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
#define NM_BRIDGE_RINGSIZE 1024 /* in the device */
#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/*
 * NM_FT_NULL terminates a list of slots in the ft.
 * It is stored in the uint16_t bq_head/bq_tail/ft_next style links, so
 * NM_BDG_BATCH_MAX has to fit in 16 bits.
 */
#define NM_FT_NULL NM_BDG_BATCH_MAX
107
108
109 /*
110 * bridge_batch is set via sysctl to the max batch size to be
111 * used in the bridge. The actual value may be larger as the
112 * last packet in the block may overflow the size.
113 */
/*
 * NOTE(review): the sysctl below installs no range check, so this value
 * can be set above NM_BDG_BATCH or to a non-positive number at run time.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */

/* Max number of vale bridges (loader tunable). */
unsigned int vale_max_bridges = NM_BRIDGES;

/*
 * Export both variables under the _dev_netmap sysctl node:
 * bridge_batch is read/write (CTLFLAG_RW), max_bridges is a
 * read-only tunable (CTLFLAG_RDTUN).
 */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSCTL_UINT(_dev_netmap, OID_AUTO, max_bridges, CTLFLAG_RDTUN, &vale_max_bridges, 0,
		"Max number of vale bridges");
SYSEND;
126
/*
 * Forward declarations of file-local VALE callbacks.
 * netmap_vale_vp_create and netmap_vale_bwrap_attach are installed in
 * vale_bdg_ops before their definitions appear.
 */
static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
		struct nm_bridge *);
static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);
132
/*
 * For each output interface, nm_vale_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 *
 * bq_head and bq_tail are indexes into the forwarding table (ft) of the
 * first and last packet queued for this destination; both hold
 * NM_FT_NULL while the queue is empty.
 */
struct nm_vale_q {
	uint16_t bq_head;	/* ft index of the first queued packet */
	uint16_t bq_tail;	/* ft index of the last queued packet */
	uint32_t bq_len;	/* number of buffers */
};
143
/*
 * Holds the default callbacks.
 * This table is handed to nm_find_bridge() when netmap_vale_create()
 * creates a bridge; the lookup member is the per-packet forwarding
 * decision invoked through b->bdg_ops.lookup in the flush path.
 */
struct netmap_bdg_ops vale_bdg_ops = {
	.lookup = netmap_vale_learning,	/* learning-bridge lookup */
	.config = NULL,
	.dtor = NULL,
	.vp_create = netmap_vale_vp_create,
	.bwrap_attach = netmap_vale_bwrap_attach,
	.name = NM_BDG_NAME,
};
153
/*
 * Copy a packet payload between two non-overlapping buffers.
 *
 * Lengths below 1024 are moved in 64-byte steps (eight 64-bit words at
 * a time), so the number of bytes actually transferred is l rounded up
 * to the next multiple of 64: the caller must guarantee that both the
 * source and the destination buffer have room for that.
 * Lengths of 1024 bytes or more are handed to memcpy() and copied
 * exactly. A non-positive l copies nothing.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *from = _src;
	uint64_t *to = _dst;

	if (unlikely(l >= 1024)) {
		memcpy(to, from, l);
		return;
	}
	while (likely(l > 0)) {
		/* one 64-byte chunk */
		to[0] = from[0];
		to[1] = from[1];
		to[2] = from[2];
		to[3] = from[3];
		to[4] = from[4];
		to[5] = from[5];
		to[6] = from[6];
		to[7] = from[7];
		from += 8;
		to += 8;
		l -= 64;
	}
}
182
183
184 /*
185 * Free the forwarding tables for rings attached to switch ports.
186 */
187 static void
nm_free_bdgfwd(struct netmap_adapter * na)188 nm_free_bdgfwd(struct netmap_adapter *na)
189 {
190 int nrings, i;
191 struct netmap_kring **kring;
192
193 NMG_LOCK_ASSERT();
194 nrings = na->num_tx_rings;
195 kring = na->tx_rings;
196 for (i = 0; i < nrings; i++) {
197 if (kring[i]->nkr_ft) {
198 nm_os_free(kring[i]->nkr_ft);
199 kring[i]->nkr_ft = NULL; /* protect from freeing twice */
200 }
201 }
202 }
203
204
205 /*
206 * Allocate the forwarding tables for the rings attached to the bridge ports.
207 */
208 static int
nm_alloc_bdgfwd(struct netmap_adapter * na)209 nm_alloc_bdgfwd(struct netmap_adapter *na)
210 {
211 int nrings, l, i, num_dstq;
212 struct netmap_kring **kring;
213
214 NMG_LOCK_ASSERT();
215 /* all port:rings + broadcast */
216 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
217 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
218 l += sizeof(struct nm_vale_q) * num_dstq;
219 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
220
221 nrings = netmap_real_rings(na, NR_TX);
222 kring = na->tx_rings;
223 for (i = 0; i < nrings; i++) {
224 struct nm_bdg_fwd *ft;
225 struct nm_vale_q *dstq;
226 int j;
227
228 ft = nm_os_malloc(l);
229 if (!ft) {
230 nm_free_bdgfwd(na);
231 return ENOMEM;
232 }
233 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
234 for (j = 0; j < num_dstq; j++) {
235 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
236 dstq[j].bq_len = 0;
237 }
238 kring[i]->nkr_ft = ft;
239 }
240 return 0;
241 }
242
243 /* Allows external modules to create bridges in exclusive mode,
244 * returns an authentication token that the external module will need
245 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
246 * and nm_bdg_update_private_data() operations.
247 * Successfully executed if ret != NULL and *return_status == 0.
248 */
249 void *
netmap_vale_create(const char * bdg_name,int * return_status)250 netmap_vale_create(const char *bdg_name, int *return_status)
251 {
252 struct nm_bridge *b = NULL;
253 void *ret = NULL;
254
255 NMG_LOCK();
256 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
257 if (b) {
258 *return_status = EEXIST;
259 goto unlock_bdg_create;
260 }
261
262 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
263 if (!b) {
264 *return_status = ENOMEM;
265 goto unlock_bdg_create;
266 }
267
268 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
269 ret = nm_bdg_get_auth_token(b);
270 *return_status = 0;
271
272 unlock_bdg_create:
273 NMG_UNLOCK();
274 return ret;
275 }
276
277 /* Allows external modules to destroy a bridge created through
278 * netmap_bdg_create(), the bridge must be empty.
279 */
280 int
netmap_vale_destroy(const char * bdg_name,void * auth_token)281 netmap_vale_destroy(const char *bdg_name, void *auth_token)
282 {
283 struct nm_bridge *b = NULL;
284 int ret = 0;
285
286 NMG_LOCK();
287 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
288 if (!b) {
289 ret = ENXIO;
290 goto unlock_bdg_free;
291 }
292
293 if (!nm_bdg_valid_auth_token(b, auth_token)) {
294 ret = EACCES;
295 goto unlock_bdg_free;
296 }
297 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
298 ret = EINVAL;
299 goto unlock_bdg_free;
300 }
301
302 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
303 ret = netmap_bdg_free(b);
304 if (ret) {
305 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
306 }
307
308 unlock_bdg_free:
309 NMG_UNLOCK();
310 return ret;
311 }
312
313 /* Process NETMAP_REQ_VALE_LIST. */
314 int
netmap_vale_list(struct nmreq_header * hdr)315 netmap_vale_list(struct nmreq_header *hdr)
316 {
317 struct nmreq_vale_list *req =
318 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
319 int namelen = strlen(hdr->nr_name);
320 struct nm_bridge *b, *bridges;
321 struct netmap_vp_adapter *vpna;
322 int error = 0, i, j;
323 u_int num_bridges;
324
325 netmap_bns_getbridges(&bridges, &num_bridges);
326
327 /* this is used to enumerate bridges and ports */
328 if (namelen) { /* look up indexes of bridge and port */
329 if (strncmp(hdr->nr_name, NM_BDG_NAME,
330 strlen(NM_BDG_NAME))) {
331 return EINVAL;
332 }
333 NMG_LOCK();
334 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
335 if (!b) {
336 NMG_UNLOCK();
337 return ENOENT;
338 }
339
340 req->nr_bridge_idx = b - bridges; /* bridge index */
341 req->nr_port_idx = NM_BDG_NOPORT;
342 for (j = 0; j < b->bdg_active_ports; j++) {
343 i = b->bdg_port_index[j];
344 vpna = b->bdg_ports[i];
345 if (vpna == NULL) {
346 nm_prerr("This should not happen");
347 continue;
348 }
349 /* the former and the latter identify a
350 * virtual port and a NIC, respectively
351 */
352 if (!strcmp(vpna->up.name, hdr->nr_name)) {
353 req->nr_port_idx = i; /* port index */
354 break;
355 }
356 }
357 NMG_UNLOCK();
358 } else {
359 /* return the first non-empty entry starting from
360 * bridge nr_arg1 and port nr_arg2.
361 *
362 * Users can detect the end of the same bridge by
363 * seeing the new and old value of nr_arg1, and can
364 * detect the end of all the bridge by error != 0
365 */
366 i = req->nr_bridge_idx;
367 j = req->nr_port_idx;
368
369 NMG_LOCK();
370 for (error = ENOENT; i < vale_max_bridges; i++) {
371 b = bridges + i;
372 for ( ; j < NM_BDG_MAXPORTS; j++) {
373 if (b->bdg_ports[j] == NULL)
374 continue;
375 vpna = b->bdg_ports[j];
376 /* write back the VALE switch name */
377 strlcpy(hdr->nr_name, vpna->up.name,
378 sizeof(hdr->nr_name));
379 error = 0;
380 goto out;
381 }
382 j = 0; /* following bridges scan from 0 */
383 }
384 out:
385 req->nr_bridge_idx = i;
386 req->nr_port_idx = j;
387 NMG_UNLOCK();
388 }
389
390 return error;
391 }
392
393 /* Process NETMAP_REQ_VALE_ATTACH.
394 */
395 int
netmap_vale_attach(struct nmreq_header * hdr,void * auth_token)396 netmap_vale_attach(struct nmreq_header *hdr, void *auth_token)
397 {
398 struct nmreq_vale_attach *req =
399 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
400 struct netmap_vp_adapter * vpna;
401 struct netmap_adapter *na = NULL;
402 struct netmap_mem_d *nmd = NULL;
403 struct nm_bridge *b = NULL;
404 int error;
405
406 NMG_LOCK();
407 /* permission check for modified bridges */
408 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
409 if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
410 error = EACCES;
411 goto unlock_exit;
412 }
413
414 if (req->reg.nr_mem_id) {
415 nmd = netmap_mem_find(req->reg.nr_mem_id);
416 if (nmd == NULL) {
417 error = EINVAL;
418 goto unlock_exit;
419 }
420 }
421
422 /* check for existing one */
423 error = netmap_get_vale_na(hdr, &na, nmd, 0);
424 if (na) {
425 error = EBUSY;
426 goto unref_exit;
427 }
428 error = netmap_get_vale_na(hdr, &na,
429 nmd, 1 /* create if not exists */);
430 if (error) { /* no device */
431 goto unlock_exit;
432 }
433
434 if (na == NULL) { /* VALE prefix missing */
435 error = EINVAL;
436 goto unlock_exit;
437 }
438
439 if (NETMAP_OWNED_BY_ANY(na)) {
440 error = EBUSY;
441 goto unref_exit;
442 }
443
444 if (na->nm_bdg_ctl) {
445 /* nop for VALE ports. The bwrap needs to put the hwna
446 * in netmap mode (see netmap_bwrap_bdg_ctl)
447 */
448 error = na->nm_bdg_ctl(hdr, na);
449 if (error)
450 goto unref_exit;
451 nm_prdis("registered %s to netmap-mode", na->name);
452 }
453 vpna = (struct netmap_vp_adapter *)na;
454 req->port_index = vpna->bdg_port;
455
456 if (nmd)
457 netmap_mem_put(nmd);
458
459 NMG_UNLOCK();
460 return 0;
461
462 unref_exit:
463 netmap_adapter_put(na);
464 unlock_exit:
465 if (nmd)
466 netmap_mem_put(nmd);
467
468 NMG_UNLOCK();
469 return error;
470 }
471
472 /* Process NETMAP_REQ_VALE_DETACH.
473 */
474 int
netmap_vale_detach(struct nmreq_header * hdr,void * auth_token)475 netmap_vale_detach(struct nmreq_header *hdr, void *auth_token)
476 {
477 struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
478 struct netmap_vp_adapter *vpna;
479 struct netmap_adapter *na;
480 struct nm_bridge *b = NULL;
481 int error;
482
483 NMG_LOCK();
484 /* permission check for modified bridges */
485 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
486 if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
487 error = EACCES;
488 goto unlock_exit;
489 }
490
491 error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
492 if (error) { /* no device, or another bridge or user owns the device */
493 goto unlock_exit;
494 }
495
496 if (na == NULL) { /* VALE prefix missing */
497 error = EINVAL;
498 goto unlock_exit;
499 } else if (nm_is_bwrap(na) &&
500 ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
501 /* Don't detach a NIC with polling */
502 error = EBUSY;
503 goto unref_exit;
504 }
505
506 vpna = (struct netmap_vp_adapter *)na;
507 if (na->na_vp != vpna) {
508 /* trying to detach first attach of VALE persistent port attached
509 * to 2 bridges
510 */
511 error = EBUSY;
512 goto unref_exit;
513 }
514 nmreq_det->port_index = vpna->bdg_port;
515
516 if (na->nm_bdg_ctl) {
517 /* remove the port from bridge. The bwrap
518 * also needs to put the hwna in normal mode
519 */
520 error = na->nm_bdg_ctl(hdr, na);
521 }
522
523 unref_exit:
524 netmap_adapter_put(na);
525 unlock_exit:
526 NMG_UNLOCK();
527 return error;
528
529 }
530
531
532 /* nm_dtor callback for ephemeral VALE ports */
533 static void
netmap_vale_vp_dtor(struct netmap_adapter * na)534 netmap_vale_vp_dtor(struct netmap_adapter *na)
535 {
536 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
537 struct nm_bridge *b = vpna->na_bdg;
538
539 nm_prdis("%s has %d references", na->name, na->na_refcount);
540
541 if (b) {
542 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
543 }
544
545 if (na->ifp != NULL && !nm_iszombie(na)) {
546 NM_DETACH_NA(na->ifp);
547 if (vpna->autodelete) {
548 nm_prdis("releasing %s", na->ifp->if_xname);
549 NMG_UNLOCK();
550 nm_os_vi_detach(na->ifp);
551 NMG_LOCK();
552 }
553 }
554 }
555
556
557
558 /* nm_krings_create callback for VALE ports.
559 * Calls the standard netmap_krings_create, then adds leases on rx
560 * rings and bdgfwd on tx rings.
561 */
562 static int
netmap_vale_vp_krings_create(struct netmap_adapter * na)563 netmap_vale_vp_krings_create(struct netmap_adapter *na)
564 {
565 u_int tailroom;
566 int error, i;
567 uint32_t *leases;
568 u_int nrx = netmap_real_rings(na, NR_RX);
569
570 /*
571 * Leases are attached to RX rings on vale ports
572 */
573 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
574
575 error = netmap_krings_create(na, tailroom);
576 if (error)
577 return error;
578
579 leases = na->tailroom;
580
581 for (i = 0; i < nrx; i++) { /* Receive rings */
582 na->rx_rings[i]->nkr_leases = leases;
583 leases += na->num_rx_desc;
584 }
585
586 error = nm_alloc_bdgfwd(na);
587 if (error) {
588 netmap_krings_delete(na);
589 return error;
590 }
591
592 return 0;
593 }
594
595
/*
 * nm_krings_delete callback for VALE ports.
 * Releases the forwarding tables hanging off the TX krings first, then
 * lets netmap_krings_delete() dispose of the krings themselves.
 */
static void
netmap_vale_vp_krings_delete(struct netmap_adapter *na)
{
	/* the nkr_ft pointers live inside the krings: free them first */
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}
603
604
/*
 * Forward declaration: nm_vale_preflush() calls nm_vale_flush() before
 * the latter is defined.
 * ft is the forwarding table, n the number of entries filled in, na the
 * source port and ring_nr the source ring.
 */
static int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
		struct netmap_vp_adapter *na, u_int ring_nr);
608
609
610 /*
611 * main dispatch routine for the bridge.
612 * Grab packets from a kring, move them into the ft structure
613 * associated to the tx (input) port. Max one instance per port,
614 * filtered on input (ioctl, poll or XXX).
615 * Returns the next position in the ring.
616 */
617 static int
nm_vale_preflush(struct netmap_kring * kring,u_int end)618 nm_vale_preflush(struct netmap_kring *kring, u_int end)
619 {
620 struct netmap_vp_adapter *na =
621 (struct netmap_vp_adapter*)kring->na;
622 struct netmap_ring *ring = kring->ring;
623 struct nm_bdg_fwd *ft;
624 u_int ring_nr = kring->ring_id;
625 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
626 u_int ft_i = 0; /* start from 0 */
627 u_int frags = 1; /* how many frags ? */
628 struct nm_bridge *b = na->na_bdg;
629
630 /* To protect against modifications to the bridge we acquire a
631 * shared lock, waiting if we can sleep (if the source port is
632 * attached to a user process) or with a trylock otherwise (NICs).
633 */
634 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
635 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
636 BDG_RLOCK(b);
637 else if (!BDG_RTRYLOCK(b))
638 return j;
639 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
640 ft = kring->nkr_ft;
641
642 for (; likely(j != end); j = nm_next(j, lim)) {
643 struct netmap_slot *slot = &ring->slot[j];
644 char *buf;
645
646 ft[ft_i].ft_len = slot->len;
647 ft[ft_i].ft_flags = slot->flags;
648 ft[ft_i].ft_offset = 0;
649
650 nm_prdis("flags is 0x%x", slot->flags);
651 /* we do not use the buf changed flag, but we still need to reset it */
652 slot->flags &= ~NS_BUF_CHANGED;
653
654 /* this slot goes into a list so initialize the link field */
655 ft[ft_i].ft_next = NM_FT_NULL;
656 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
657 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
658 if (unlikely(buf == NULL)) {
659 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
660 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
661 kring->name, j, ft[ft_i].ft_len);
662 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
663 ft[ft_i].ft_len = 0;
664 ft[ft_i].ft_flags = 0;
665 }
666 __builtin_prefetch(buf);
667 ++ft_i;
668 if (slot->flags & NS_MOREFRAG) {
669 frags++;
670 continue;
671 }
672 if (unlikely(netmap_verbose && frags > 1))
673 nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
674 ft[ft_i - frags].ft_frags = frags;
675 frags = 1;
676 if (unlikely((int)ft_i >= bridge_batch))
677 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
678 }
679 if (frags > 1) {
680 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
681 * have to fix frags count. */
682 frags--;
683 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
684 ft[ft_i - frags].ft_frags = frags;
685 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
686 }
687 if (ft_i)
688 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
689 BDG_RUNLOCK(b);
690 return j;
691 }
692
693
694 /* ----- FreeBSD if_bridge hash function ------- */
695
696 /*
697 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
698 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
699 *
700 * http://www.burtleburtle.net/bob/hash/spooky.html
701 */
702 #define mix(a, b, c) \
703 do { \
704 a -= b; a -= c; a ^= (c >> 13); \
705 b -= c; b -= a; b ^= (a << 8); \
706 c -= a; c -= b; c ^= (b >> 13); \
707 a -= b; a -= c; a ^= (c >> 12); \
708 b -= c; b -= a; b ^= (a << 16); \
709 c -= a; c -= b; c ^= (b >> 5); \
710 a -= b; a -= c; a ^= (c >> 3); \
711 b -= c; b -= a; b ^= (a << 10); \
712 c -= a; c -= b; c ^= (b >> 15); \
713 } while (/*CONSTCOND*/0)
714
715
716 static __inline uint32_t
nm_vale_rthash(const uint8_t * addr)717 nm_vale_rthash(const uint8_t *addr)
718 {
719 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
720
721 b += addr[5] << 8;
722 b += addr[4];
723 a += addr[3] << 24;
724 a += addr[2] << 16;
725 a += addr[1] << 8;
726 a += addr[0];
727
728 mix(a, b, c);
729 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
730 return (c & BRIDGE_RTHASH_MASK);
731 }
732
733 #undef mix
734
735
736 /*
737 * Lookup function for a learning bridge.
738 * Update the hash table with the source address,
739 * and then returns the destination port index, and the
740 * ring in *dst_ring (at the moment, always use ring 0)
741 */
742 uint32_t
netmap_vale_learning(struct nm_bdg_fwd * ft,uint8_t * dst_ring,struct netmap_vp_adapter * na,void * private_data)743 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
744 struct netmap_vp_adapter *na, void *private_data)
745 {
746 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
747 u_int buf_len = ft->ft_len - ft->ft_offset;
748 struct nm_hash_ent *ht = private_data;
749 uint32_t sh, dh;
750 u_int dst, mysrc = na->bdg_port;
751 uint64_t smac, dmac;
752 uint8_t indbuf[12];
753
754 if (buf_len < 14) {
755 return NM_BDG_NOPORT;
756 }
757
758 if (ft->ft_flags & NS_INDIRECT) {
759 if (copyin(buf, indbuf, sizeof(indbuf))) {
760 return NM_BDG_NOPORT;
761 }
762 buf = indbuf;
763 }
764
765 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
766 smac = le64toh(*(uint64_t *)(buf + 4));
767 smac >>= 16;
768
769 /*
770 * The hash is somewhat expensive, there might be some
771 * worthwhile optimizations here.
772 */
773 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
774 uint8_t *s = buf+6;
775 sh = nm_vale_rthash(s); /* hash of source */
776 /* update source port forwarding entry */
777 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
778 ht[sh].ports = mysrc;
779 if (netmap_debug & NM_DEBUG_VALE)
780 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
781 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
782 }
783 dst = NM_BDG_BROADCAST;
784 if ((buf[0] & 1) == 0) { /* unicast */
785 dh = nm_vale_rthash(buf); /* hash of dst */
786 if (ht[dh].mac == dmac) { /* found dst */
787 dst = ht[dh].ports;
788 }
789 }
790 return dst;
791 }
792
793
794 /*
795 * Available space in the ring. Only used in VALE code
796 * and only with is_rx = 1
797 */
798 static inline uint32_t
nm_kr_space(struct netmap_kring * k,int is_rx)799 nm_kr_space(struct netmap_kring *k, int is_rx)
800 {
801 int space;
802
803 if (is_rx) {
804 int busy = k->nkr_hwlease - k->nr_hwcur;
805 if (busy < 0)
806 busy += k->nkr_num_slots;
807 space = k->nkr_num_slots - 1 - busy;
808 } else {
809 /* XXX never used in this branch */
810 space = k->nr_hwtail - k->nkr_hwlease;
811 if (space < 0)
812 space += k->nkr_num_slots;
813 }
814 #if 0
815 // sanity check
816 if (k->nkr_hwlease >= k->nkr_num_slots ||
817 k->nr_hwcur >= k->nkr_num_slots ||
818 k->nr_tail >= k->nkr_num_slots ||
819 busy < 0 ||
820 busy >= k->nkr_num_slots) {
821 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
822 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
823 k->nkr_lease_idx, k->nkr_num_slots);
824 }
825 #endif
826 return space;
827 }
828
829
830
831
832 /* make a lease on the kring for N positions. return the
833 * lease index
834 * XXX only used in VALE code and with is_rx = 1
835 */
836 static inline uint32_t
nm_kr_lease(struct netmap_kring * k,u_int n,int is_rx)837 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
838 {
839 uint32_t lim = k->nkr_num_slots - 1;
840 uint32_t lease_idx = k->nkr_lease_idx;
841
842 k->nkr_leases[lease_idx] = NR_NOSLOT;
843 k->nkr_lease_idx = nm_next(lease_idx, lim);
844
845 #ifdef CONFIG_NETMAP_DEBUG
846 if (n > nm_kr_space(k, is_rx)) {
847 nm_prerr("invalid request for %d slots", n);
848 panic("x");
849 }
850 #endif /* CONFIG NETMAP_DEBUG */
851 /* XXX verify that there are n slots */
852 k->nkr_hwlease += n;
853 if (k->nkr_hwlease > lim)
854 k->nkr_hwlease -= lim + 1;
855
856 #ifdef CONFIG_NETMAP_DEBUG
857 if (k->nkr_hwlease >= k->nkr_num_slots ||
858 k->nr_hwcur >= k->nkr_num_slots ||
859 k->nr_hwtail >= k->nkr_num_slots ||
860 k->nkr_lease_idx >= k->nkr_num_slots) {
861 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
862 k->na->name,
863 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
864 k->nkr_lease_idx, k->nkr_num_slots);
865 }
866 #endif /* CONFIG_NETMAP_DEBUG */
867 return lease_idx;
868 }
869
870 /*
871 *
872 * This flush routine supports only unicast and broadcast but a large
873 * number of ports, and lets us replace the learn and dispatch functions.
874 */
875 int
nm_vale_flush(struct nm_bdg_fwd * ft,u_int n,struct netmap_vp_adapter * na,u_int ring_nr)876 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
877 u_int ring_nr)
878 {
879 struct nm_vale_q *dst_ents, *brddst;
880 uint16_t num_dsts = 0, *dsts;
881 struct nm_bridge *b = na->na_bdg;
882 u_int i, me = na->bdg_port;
883
884 /*
885 * The work area (pointed by ft) is followed by an array of
886 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
887 * queues per port plus one for the broadcast traffic.
888 * Then we have an array of destination indexes.
889 */
890 dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
891 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
892
893 /* first pass: find a destination for each packet in the batch */
894 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
895 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
896 uint16_t dst_port, d_i;
897 struct nm_vale_q *d;
898 struct nm_bdg_fwd *start_ft = NULL;
899
900 nm_prdis("slot %d frags %d", i, ft[i].ft_frags);
901
902 if (na->up.virt_hdr_len < ft[i].ft_len) {
903 ft[i].ft_offset = na->up.virt_hdr_len;
904 start_ft = &ft[i];
905 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
906 ft[i].ft_offset = ft[i].ft_len;
907 start_ft = &ft[i+1];
908 } else {
909 /* Drop the packet if the virtio-net header is not into the first
910 * fragment nor at the very beginning of the second.
911 */
912 continue;
913 }
914 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
915 if (netmap_verbose > 255)
916 nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
917 if (dst_port >= NM_BDG_NOPORT)
918 continue; /* this packet is identified to be dropped */
919 else if (dst_port == NM_BDG_BROADCAST)
920 dst_ring = 0; /* broadcasts always go to ring 0 */
921 else if (unlikely(dst_port == me ||
922 !b->bdg_ports[dst_port]))
923 continue;
924
925 /* get a position in the scratch pad */
926 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
927 d = dst_ents + d_i;
928
929 /* append the first fragment to the list */
930 if (d->bq_head == NM_FT_NULL) { /* new destination */
931 d->bq_head = d->bq_tail = i;
932 /* remember this position to be scanned later */
933 if (dst_port != NM_BDG_BROADCAST)
934 dsts[num_dsts++] = d_i;
935 } else {
936 ft[d->bq_tail].ft_next = i;
937 d->bq_tail = i;
938 }
939 d->bq_len += ft[i].ft_frags;
940 }
941
942 /*
943 * Broadcast traffic goes to ring 0 on all destinations.
944 * So we need to add these rings to the list of ports to scan.
945 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
946 * expensive. We should keep a compact list of active destinations
947 * so we could shorten this loop.
948 */
949 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
950 if (brddst->bq_head != NM_FT_NULL) {
951 u_int j;
952 for (j = 0; likely(j < b->bdg_active_ports); j++) {
953 uint16_t d_i;
954 i = b->bdg_port_index[j];
955 if (unlikely(i == me))
956 continue;
957 d_i = i * NM_BDG_MAXRINGS;
958 if (dst_ents[d_i].bq_head == NM_FT_NULL)
959 dsts[num_dsts++] = d_i;
960 }
961 }
962
963 nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
964 /* second pass: scan destinations */
965 for (i = 0; i < num_dsts; i++) {
966 struct netmap_vp_adapter *dst_na;
967 struct netmap_kring *kring;
968 struct netmap_ring *ring;
969 u_int dst_nr, lim, j, d_i, next, brd_next;
970 u_int needed, howmany;
971 int retry = netmap_txsync_retry;
972 struct nm_vale_q *d;
973 uint32_t my_start = 0, lease_idx = 0;
974 int nrings;
975 int virt_hdr_mismatch = 0;
976
977 d_i = dsts[i];
978 nm_prdis("second pass %d port %d", i, d_i);
979 d = dst_ents + d_i;
980 // XXX fix the division
981 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
982 /* protect from the lookup function returning an inactive
983 * destination port
984 */
985 if (unlikely(dst_na == NULL))
986 goto cleanup;
987 if (dst_na->up.na_flags & NAF_SW_ONLY)
988 goto cleanup;
989 /*
990 * The interface may be in !netmap mode in two cases:
991 * - when na is attached but not activated yet;
992 * - when na is being deactivated but is still attached.
993 */
994 if (unlikely(!nm_netmap_on(&dst_na->up))) {
995 nm_prdis("not in netmap mode!");
996 goto cleanup;
997 }
998
999 /* there is at least one either unicast or broadcast packet */
1000 brd_next = brddst->bq_head;
1001 next = d->bq_head;
/* We need to reserve this many slots. If fewer are
 * available, some packets will be dropped.
 * Packets may have multiple fragments, so there is a chance
 * that we will not use all of the slots we have claimed;
 * in that case we need to handle the leftover ones when we
 * regain the lock.
 */
1009 needed = d->bq_len + brddst->bq_len;
1010
1011 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1012 if (netmap_verbose) {
1013 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
1014 dst_na->up.virt_hdr_len);
1015 }
1016 /* There is a virtio-net header/offloadings mismatch between
1017 * source and destination. The slower mismatch datapath will
1018 * be used to cope with all the mismatches.
1019 */
1020 virt_hdr_mismatch = 1;
1021 if (dst_na->mfs < na->mfs) {
1022 /* We may need to do segmentation offloadings, and so
1023 * we may need a number of destination slots greater
1024 * than the number of input slots ('needed').
1025 * We look for the smallest integer 'x' which satisfies:
1026 * needed * na->mfs + x * H <= x * na->mfs
1027 * where 'H' is the length of the longest header that may
1028 * be replicated in the segmentation process (e.g. for
1029 * TCPv4 we must account for ethernet header, IP header
1030 * and TCPv4 header).
1031 */
1032 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
1033 needed = (needed * na->mfs) /
1034 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1035 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1036 }
1037 }
1038
1039 nm_prdis(5, "pass 2 dst %d is %x %s",
1040 i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
1041 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1042 nrings = dst_na->up.num_rx_rings;
1043 if (dst_nr >= nrings)
1044 dst_nr = dst_nr % nrings;
1045 kring = dst_na->up.rx_rings[dst_nr];
1046 ring = kring->ring;
1047 /* the destination ring may have not been opened for RX */
1048 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
1049 goto cleanup;
1050 lim = kring->nkr_num_slots - 1;
1051
1052 retry:
1053
1054 if (dst_na->retry && retry) {
1055 /* try to get some free slot from the previous run */
1056 kring->nm_notify(kring, NAF_FORCE_RECLAIM);
1057 /* actually useful only for bwraps, since there
1058 * the notify will trigger a txsync on the hwna. VALE ports
1059 * have dst_na->retry == 0
1060 */
1061 }
1062 /* reserve the buffers in the queue and an entry
1063 * to report completion, and drop lock.
1064 * XXX this might become a helper function.
1065 */
1066 mtx_lock(&kring->q_lock);
1067 if (kring->nkr_stopped) {
1068 mtx_unlock(&kring->q_lock);
1069 goto cleanup;
1070 }
1071 my_start = j = kring->nkr_hwlease;
1072 howmany = nm_kr_space(kring, 1);
1073 if (needed < howmany)
1074 howmany = needed;
1075 lease_idx = nm_kr_lease(kring, howmany, 1);
1076 mtx_unlock(&kring->q_lock);
1077
1078 /* only retry if we need more than available slots */
1079 if (retry && needed <= howmany)
1080 retry = 0;
1081
1082 /* copy to the destination queue */
1083 while (howmany > 0) {
1084 struct netmap_slot *slot;
1085 struct nm_bdg_fwd *ft_p, *ft_end;
1086 u_int cnt;
1087
1088 /* find the queue from which we pick next packet.
1089 * NM_FT_NULL is always higher than valid indexes
1090 * so we never dereference it if the other list
1091 * has packets (and if both are empty we never
1092 * get here).
1093 */
1094 if (next < brd_next) {
1095 ft_p = ft + next;
1096 next = ft_p->ft_next;
1097 } else { /* insert broadcast */
1098 ft_p = ft + brd_next;
1099 brd_next = ft_p->ft_next;
1100 }
1101 cnt = ft_p->ft_frags; // cnt > 0
1102 if (unlikely(cnt > howmany))
1103 break; /* no more space */
1104 if (netmap_verbose && cnt > 1)
1105 nm_prlim(5, "rx %d frags to %d", cnt, j);
1106 ft_end = ft_p + cnt;
1107 if (unlikely(virt_hdr_mismatch)) {
1108 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1109 } else {
1110 howmany -= cnt;
1111 do {
1112 char *dst, *src = ft_p->ft_buf;
1113 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1114
1115 slot = &ring->slot[j];
1116 dst = NMB(&dst_na->up, slot);
1117
1118 nm_prdis("send [%d] %d(%d) bytes at %s:%d",
1119 i, (int)copy_len, (int)dst_len,
1120 dst_na->up.name, j);
1121 /* round to a multiple of 64 */
1122 copy_len = (copy_len + 63) & ~63;
1123
1124 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1125 copy_len > NETMAP_BUF_SIZE(&na->up))) {
1126 nm_prlim(5, "invalid len %d, down to 64", (int)copy_len);
1127 copy_len = dst_len = 64; // XXX
1128 }
1129 if (ft_p->ft_flags & NS_INDIRECT) {
1130 if (copyin(src, dst, copy_len)) {
1131 // invalid user pointer, pretend len is 0
1132 dst_len = 0;
1133 }
1134 } else {
1135 //memcpy(dst, src, copy_len);
1136 pkt_copy(src, dst, (int)copy_len);
1137 }
1138 slot->len = dst_len;
1139 slot->flags = (cnt << 8)| NS_MOREFRAG;
1140 j = nm_next(j, lim);
1141 needed--;
1142 ft_p++;
1143 } while (ft_p != ft_end);
1144 slot->flags = (cnt << 8); /* clear flag on last entry */
1145 }
1146 /* are we done ? */
1147 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1148 break;
1149 }
1150 {
1151 /* current position */
1152 uint32_t *p = kring->nkr_leases; /* shorthand */
1153 uint32_t update_pos;
1154 int still_locked = 1;
1155
1156 mtx_lock(&kring->q_lock);
1157 if (unlikely(howmany > 0)) {
1158 /* not used all bufs. If i am the last one
1159 * i can recover the slots, otherwise must
1160 * fill them with 0 to mark empty packets.
1161 */
1162 nm_prdis("leftover %d bufs", howmany);
1163 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1164 /* yes i am the last one */
1165 nm_prdis("roll back nkr_hwlease to %d", j);
1166 kring->nkr_hwlease = j;
1167 } else {
1168 while (howmany-- > 0) {
1169 ring->slot[j].len = 0;
1170 ring->slot[j].flags = 0;
1171 j = nm_next(j, lim);
1172 }
1173 }
1174 }
1175 p[lease_idx] = j; /* report I am done */
1176
1177 update_pos = kring->nr_hwtail;
1178
1179 if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
1184 while (lease_idx != kring->nkr_lease_idx &&
1185 p[lease_idx] != NR_NOSLOT) {
1186 j = p[lease_idx];
1187 p[lease_idx] = NR_NOSLOT;
1188 lease_idx = nm_next(lease_idx, lim);
1189 }
1190 /* j is the new 'write' position. j != my_start
1191 * means there are new buffers to report
1192 */
1193 if (likely(j != my_start)) {
1194 kring->nr_hwtail = j;
1195 still_locked = 0;
1196 mtx_unlock(&kring->q_lock);
1197 kring->nm_notify(kring, 0);
1198 /* this is netmap_notify for VALE ports and
1199 * netmap_bwrap_notify for bwrap. The latter will
1200 * trigger a txsync on the underlying hwna
1201 */
1202 if (dst_na->retry && retry--) {
1203 /* XXX this is going to call nm_notify again.
1204 * Only useful for bwrap in virtual machines
1205 */
1206 goto retry;
1207 }
1208 }
1209 }
1210 if (still_locked)
1211 mtx_unlock(&kring->q_lock);
1212 }
1213 cleanup:
1214 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1215 d->bq_len = 0;
1216 }
1217 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1218 brddst->bq_len = 0;
1219 return 0;
1220 }
1221
1222 /* nm_txsync callback for VALE ports */
1223 static int
netmap_vale_vp_txsync(struct netmap_kring * kring,int flags)1224 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
1225 {
1226 struct netmap_vp_adapter *na =
1227 (struct netmap_vp_adapter *)kring->na;
1228 u_int done;
1229 u_int const lim = kring->nkr_num_slots - 1;
1230 u_int const head = kring->rhead;
1231
1232 if (bridge_batch <= 0) { /* testing only */
1233 done = head; // used all
1234 goto done;
1235 }
1236 if (!na->na_bdg) {
1237 done = head;
1238 goto done;
1239 }
1240 if (bridge_batch > NM_BDG_BATCH)
1241 bridge_batch = NM_BDG_BATCH;
1242
1243 done = nm_vale_preflush(kring, head);
1244 done:
1245 if (done != head)
1246 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1247 /*
1248 * packets between 'done' and 'cur' are left unsent.
1249 */
1250 kring->nr_hwcur = done;
1251 kring->nr_hwtail = nm_prev(done, lim);
1252 if (netmap_debug & NM_DEBUG_TXSYNC)
1253 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1254 return 0;
1255 }
1256
1257
1258 /* create a netmap_vp_adapter that describes a VALE port.
1259 * Only persistent VALE ports have a non-null ifp.
1260 */
1261 static int
netmap_vale_vp_create(struct nmreq_header * hdr,struct ifnet * ifp,struct netmap_mem_d * nmd,struct netmap_vp_adapter ** ret)1262 netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
1263 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
1264 {
1265 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1266 struct netmap_vp_adapter *vpna;
1267 struct netmap_adapter *na;
1268 int error = 0;
1269 u_int npipes = 0;
1270 u_int extrabufs = 0;
1271
1272 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1273 return EINVAL;
1274 }
1275
1276 vpna = nm_os_malloc(sizeof(*vpna));
1277 if (vpna == NULL)
1278 return ENOMEM;
1279
1280 na = &vpna->up;
1281
1282 na->ifp = ifp;
1283 strlcpy(na->name, hdr->nr_name, sizeof(na->name));
1284
1285 /* bound checking */
1286 na->num_tx_rings = req->nr_tx_rings;
1287 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1288 req->nr_tx_rings = na->num_tx_rings; /* write back */
1289 na->num_rx_rings = req->nr_rx_rings;
1290 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1291 req->nr_rx_rings = na->num_rx_rings; /* write back */
1292 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1293 1, NM_BDG_MAXSLOTS, NULL);
1294 na->num_tx_desc = req->nr_tx_slots;
1295 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1296 1, NM_BDG_MAXSLOTS, NULL);
1297 /* validate number of pipes. We want at least 1,
1298 * but probably can do with some more.
1299 * So let's use 2 as default (when 0 is supplied)
1300 */
1301 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1302 /* validate extra bufs */
1303 extrabufs = req->nr_extra_bufs;
1304 nm_bound_var(&extrabufs, 0, 0,
1305 128*NM_BDG_MAXSLOTS, NULL);
1306 req->nr_extra_bufs = extrabufs; /* write back */
1307 na->num_rx_desc = req->nr_rx_slots;
1308 /* Set the mfs to a default value, as it is needed on the VALE
1309 * mismatch datapath. XXX We should set it according to the MTU
1310 * known to the kernel. */
1311 vpna->mfs = NM_BDG_MFS_DEFAULT;
1312 vpna->last_smac = ~0llu;
1313 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
1314 vpna->mfs = netmap_buf_size; */
1315 if (netmap_verbose)
1316 nm_prinf("max frame size %u", vpna->mfs);
1317
1318 na->na_flags |= NAF_BDG_MAYSLEEP;
1319 /* persistent VALE ports look like hw devices
1320 * with a native netmap adapter
1321 */
1322 if (ifp)
1323 na->na_flags |= NAF_NATIVE;
1324 na->nm_txsync = netmap_vale_vp_txsync;
1325 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
1326 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */
1327 na->nm_krings_create = netmap_vale_vp_krings_create;
1328 na->nm_krings_delete = netmap_vale_vp_krings_delete;
1329 na->nm_dtor = netmap_vale_vp_dtor;
1330 nm_prdis("nr_mem_id %d", req->nr_mem_id);
1331 na->nm_mem = nmd ?
1332 netmap_mem_get(nmd):
1333 netmap_mem_private_new(
1334 na->num_tx_rings, na->num_tx_desc,
1335 na->num_rx_rings, na->num_rx_desc,
1336 req->nr_extra_bufs, npipes, &error);
1337 if (na->nm_mem == NULL)
1338 goto err;
1339 na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
1340 /* other nmd fields are set in the common routine */
1341 error = netmap_attach_common(na);
1342 if (error)
1343 goto err;
1344 *ret = vpna;
1345 return 0;
1346
1347 err:
1348 if (na->nm_mem != NULL)
1349 netmap_mem_put(na->nm_mem);
1350 nm_os_free(vpna);
1351 return error;
1352 }
1353
1354 /* nm_bdg_attach callback for VALE ports
1355 * The na_vp port is this same netmap_adapter. There is no host port.
1356 */
1357 static int
netmap_vale_vp_bdg_attach(const char * name,struct netmap_adapter * na,struct nm_bridge * b)1358 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
1359 struct nm_bridge *b)
1360 {
1361 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1362
1363 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
1364 return NM_NEED_BWRAP;
1365 }
1366 na->na_vp = vpna;
1367 strlcpy(na->name, name, sizeof(na->name));
1368 na->na_hostvp = NULL;
1369 return 0;
1370 }
1371
/*
 * nm_krings_create callback for VALE bwrap adapters.
 *
 * Runs the VALE-port krings creation first (the bwrap impersonates a
 * netmap_vp_adapter), then the common bwrap step. If the common step
 * fails, the VALE-port krings are deleted again. Returns 0 on success
 * or the error of the failing step.
 */
static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	int rc = netmap_vale_vp_krings_create(na);

	if (rc == 0) {
		rc = netmap_bwrap_krings_create_common(na);
		if (rc != 0)
			netmap_vale_vp_krings_delete(na);
	}

	return rc;
}
1387
/*
 * nm_krings_delete callback for VALE bwrap adapters: undoes
 * netmap_vale_bwrap_krings_create() in reverse order, running the
 * common bwrap teardown before the VALE-port one.
 */
static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	/* common bwrap part first ... */
	netmap_bwrap_krings_delete_common(na);
	/* ... then the part created while impersonating a VALE port */
	netmap_vale_vp_krings_delete(na);
}
1394
1395 static int
netmap_vale_bwrap_attach(const char * nr_name,struct netmap_adapter * hwna)1396 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
1397 {
1398 struct netmap_bwrap_adapter *bna;
1399 struct netmap_adapter *na = NULL;
1400 struct netmap_adapter *hostna = NULL;
1401 int error;
1402
1403 bna = nm_os_malloc(sizeof(*bna));
1404 if (bna == NULL) {
1405 return ENOMEM;
1406 }
1407 na = &bna->up.up;
1408 strlcpy(na->name, nr_name, sizeof(na->name));
1409 na->nm_register = netmap_bwrap_reg;
1410 na->nm_txsync = netmap_vale_vp_txsync;
1411 // na->nm_rxsync = netmap_bwrap_rxsync;
1412 na->nm_krings_create = netmap_vale_bwrap_krings_create;
1413 na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
1414 na->nm_notify = netmap_bwrap_notify;
1415 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
1416 /* Set the mfs, needed on the VALE mismatch datapath. */
1417 bna->up.mfs = NM_BDG_MFS_DEFAULT;
1418
1419 if (hwna->na_flags & NAF_HOST_RINGS) {
1420 hostna = &bna->host.up;
1421 hostna->nm_notify = netmap_bwrap_notify;
1422 bna->host.mfs = NM_BDG_MFS_DEFAULT;
1423 }
1424
1425 error = netmap_bwrap_attach_common(na, hwna);
1426 if (error) {
1427 nm_os_free(bna);
1428 }
1429 return error;
1430 }
1431
1432 int
netmap_get_vale_na(struct nmreq_header * hdr,struct netmap_adapter ** na,struct netmap_mem_d * nmd,int create)1433 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
1434 struct netmap_mem_d *nmd, int create)
1435 {
1436 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
1437 }
1438
1439
1440 /* creates a persistent VALE port */
1441 int
nm_vi_create(struct nmreq_header * hdr)1442 nm_vi_create(struct nmreq_header *hdr)
1443 {
1444 struct nmreq_vale_newif *req =
1445 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
1446 int error = 0;
1447 /* Build a nmreq_register out of the nmreq_vale_newif,
1448 * so that we can call netmap_get_bdg_na(). */
1449 struct nmreq_register regreq;
1450 bzero(®req, sizeof(regreq));
1451 regreq.nr_tx_slots = req->nr_tx_slots;
1452 regreq.nr_rx_slots = req->nr_rx_slots;
1453 regreq.nr_tx_rings = req->nr_tx_rings;
1454 regreq.nr_rx_rings = req->nr_rx_rings;
1455 regreq.nr_mem_id = req->nr_mem_id;
1456 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
1457 hdr->nr_body = (uintptr_t)®req;
1458 error = netmap_vi_create(hdr, 0 /* no autodelete */);
1459 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
1460 hdr->nr_body = (uintptr_t)req;
1461 /* Write back to the original struct. */
1462 req->nr_tx_slots = regreq.nr_tx_slots;
1463 req->nr_rx_slots = regreq.nr_rx_slots;
1464 req->nr_tx_rings = regreq.nr_tx_rings;
1465 req->nr_rx_rings = regreq.nr_rx_rings;
1466 req->nr_mem_id = regreq.nr_mem_id;
1467 return error;
1468 }
1469
1470 /* remove a persistent VALE port from the system */
1471 int
nm_vi_destroy(const char * name)1472 nm_vi_destroy(const char *name)
1473 {
1474 struct ifnet *ifp;
1475 struct netmap_vp_adapter *vpna;
1476 int error;
1477
1478 ifp = ifunit_ref(name);
1479 if (!ifp)
1480 return ENXIO;
1481 NMG_LOCK();
1482 /* make sure this is actually a VALE port */
1483 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
1484 error = EINVAL;
1485 goto err;
1486 }
1487
1488 vpna = (struct netmap_vp_adapter *)NA(ifp);
1489
1490 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
1491 if (vpna->autodelete) {
1492 error = EINVAL;
1493 goto err;
1494 }
1495
1496 /* also make sure that nobody is using the interface */
1497 if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
1498 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
1499 error = EBUSY;
1500 goto err;
1501 }
1502
1503 NMG_UNLOCK();
1504
1505 if (netmap_verbose)
1506 nm_prinf("destroying a persistent vale interface %s", ifp->if_xname);
1507 /* Linux requires all the references are released
1508 * before unregister
1509 */
1510 netmap_detach(ifp);
1511 if_rele(ifp);
1512 nm_os_vi_detach(ifp);
1513 return 0;
1514
1515 err:
1516 NMG_UNLOCK();
1517 if_rele(ifp);
1518 return error;
1519 }
1520
1521 static int
nm_update_info(struct nmreq_register * req,struct netmap_adapter * na)1522 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
1523 {
1524 req->nr_rx_rings = na->num_rx_rings;
1525 req->nr_tx_rings = na->num_tx_rings;
1526 req->nr_rx_slots = na->num_rx_desc;
1527 req->nr_tx_slots = na->num_tx_desc;
1528 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
1529 &req->nr_mem_id);
1530 }
1531
1532
1533 /*
1534 * Create a virtual interface registered to the system.
1535 * The interface will be attached to a bridge later.
1536 */
1537 int
netmap_vi_create(struct nmreq_header * hdr,int autodelete)1538 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
1539 {
1540 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1541 struct ifnet *ifp;
1542 struct netmap_vp_adapter *vpna;
1543 struct netmap_mem_d *nmd = NULL;
1544 int error;
1545
1546 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1547 return EINVAL;
1548 }
1549
1550 /* don't include VALE prefix */
1551 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
1552 return EINVAL;
1553 if (strlen(hdr->nr_name) >= IFNAMSIZ) {
1554 return EINVAL;
1555 }
1556 ifp = ifunit_ref(hdr->nr_name);
1557 if (ifp) { /* already exist, cannot create new one */
1558 error = EEXIST;
1559 NMG_LOCK();
1560 if (NM_NA_VALID(ifp)) {
1561 int update_err = nm_update_info(req, NA(ifp));
1562 if (update_err)
1563 error = update_err;
1564 }
1565 NMG_UNLOCK();
1566 if_rele(ifp);
1567 return error;
1568 }
1569 error = nm_os_vi_persist(hdr->nr_name, &ifp);
1570 if (error)
1571 return error;
1572
1573 NMG_LOCK();
1574 if (req->nr_mem_id) {
1575 nmd = netmap_mem_find(req->nr_mem_id);
1576 if (nmd == NULL) {
1577 error = EINVAL;
1578 goto err_1;
1579 }
1580 }
1581 /* netmap_vp_create creates a struct netmap_vp_adapter */
1582 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
1583 if (error) {
1584 if (netmap_debug & NM_DEBUG_VALE)
1585 nm_prerr("error %d", error);
1586 goto err_1;
1587 }
1588 /* persist-specific routines */
1589 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
1590 if (!autodelete) {
1591 netmap_adapter_get(&vpna->up);
1592 } else {
1593 vpna->autodelete = 1;
1594 }
1595 NM_ATTACH_NA(ifp, &vpna->up);
1596 /* return the updated info */
1597 error = nm_update_info(req, &vpna->up);
1598 if (error) {
1599 goto err_2;
1600 }
1601 nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
1602 if (nmd)
1603 netmap_mem_put(nmd);
1604 NMG_UNLOCK();
1605 nm_prdis("created %s", ifp->if_xname);
1606 return 0;
1607
1608 err_2:
1609 netmap_detach(ifp);
1610 err_1:
1611 if (nmd)
1612 netmap_mem_put(nmd);
1613 NMG_UNLOCK();
1614 nm_os_vi_detach(ifp);
1615
1616 return error;
1617 }
1618
1619 #endif /* WITH_VALE */
1620