1 /*
2 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26
/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring a new port or deleting an
existing one, the lock is acquired in exclusive mode (after
holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
51
52 /*
53 * OS-specific code that is used only within this file.
54 * Other OS-specific code that must be accessed by drivers
55 * is present in netmap_kern.h
56 */
57
58 #if defined(__FreeBSD__)
59 #include <sys/cdefs.h> /* prerequisite */
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/types.h>
63 #include <sys/errno.h>
64 #include <sys/param.h> /* defines used in kernel.h */
65 #include <sys/kernel.h> /* types used in module initialization */
66 #include <sys/conf.h> /* cdevsw struct, UID, GID */
67 #include <sys/sockio.h>
68 #include <sys/socketvar.h> /* struct socket */
69 #include <sys/malloc.h>
70 #include <sys/poll.h>
71 #include <sys/rwlock.h>
72 #include <sys/socket.h> /* sockaddrs */
73 #include <sys/selinfo.h>
74 #include <sys/sysctl.h>
75 #include <net/if.h>
76 #include <net/if_var.h>
77 #include <net/bpf.h> /* BIOCIMMEDIATE */
78 #include <machine/bus.h> /* bus_dmamap_* */
79 #include <sys/endian.h>
80 #include <sys/refcount.h>
81
82
/*
 * Bridge lock wrappers. Every macro below acts on the bdg_lock member
 * of the object its argument points to.
 */
#define BDG_RWLOCK_T		struct rwlock	/* type of bdg_lock */

/* lifetime */
#define BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)

/* exclusive (writer) side */
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)

/* shared (reader) side; BDG_RTRYLOCK does not wait for the lock */
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
93
94
95 #elif defined(linux)
96
97 #include "bsd_glue.h"
98
99 #elif defined(__APPLE__)
100
101 #warning OSX support is only partial
102 #include "osx_glue.h"
103
104 #else
105
106 #error Unsupported platform
107
108 #endif /* unsupported */
109
110 /*
111 * common headers
112 */
113
114 #include <net/netmap.h>
115 #include <dev/netmap/netmap_kern.h>
116 #include <dev/netmap/netmap_mem2.h>
117
118 #ifdef WITH_VALE
119
/*
 * System parameters (most of them are in netmap_kern.h):
 *
 *	NM_NAME			prefix for switch port names, default "vale"
 *	NM_BDG_MAXPORTS		number of ports
 *	NM_BRIDGES		max number of switches in the system.
 *				XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y, where X is the switch name and Y is
 * the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use a per-queue lock instead of the core lock.
 * In the tx loop traffic is aggregated in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */

/* actual size of the tables: a full batch plus one maximal chain */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)

/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX

#define NM_BRIDGES		8	/* number of bridges */
148
149
150 /*
151 * bridge_batch is set via sysctl to the max batch size to be
152 * used in the bridge. The actual value may be larger as the
153 * last packet in the block may overflow the size.
154 */
155 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
156 SYSCTL_DECL(_dev_netmap);
157 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
158
159
/* Forward declarations of routines referenced before their definition. */
static int netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
    struct netmap_vp_adapter **ret);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *na, int onoff);
163
/*
 * Destination queue descriptor: one nm_bdg_q per output interface is
 * used to build the list of buffers headed there.
 * bq_head and bq_tail are the two ends of that list (both are set to
 * NM_FT_NULL when the queues are initialized in nm_alloc_bdgfwd()).
 */
struct nm_bdg_q {
	uint16_t bq_head, bq_tail;
	/* number of output buffers (coalescing may happen during the copy) */
	uint32_t bq_len;
};
174
/*
 * One entry of the bridge forwarding table (MAC + ports).
 * XXX revise this
 */
struct nm_hash_ent {
	/* MAC address; the top 2 bytes are the epoch */
	uint64_t mac;
	uint64_t ports;
};
180
181 /*
182 * nm_bridge is a descriptor for a VALE switch.
183 * Interfaces for a bridge are all in bdg_ports[].
184 * The array has fixed size, an empty entry does not terminate
185 * the search, but lookups only occur on attach/detach so we
186 * don't mind if they are slow.
187 *
188 * The bridge is non blocking on the transmit ports: excess
189 * packets are dropped if there is no room on the output port.
190 *
191 * bdg_lock protects accesses to the bdg_ports array.
192 * This is a rw lock (or equivalent).
193 */
194 struct nm_bridge {
195 /* XXX what is the proper alignment/layout ? */
196 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
197 int bdg_namelen;
198 uint32_t bdg_active_ports; /* 0 means free */
199 char bdg_basename[IFNAMSIZ];
200
201 /* Indexes of active ports (up to active_ports)
202 * and all other remaining ports.
203 */
204 uint8_t bdg_port_index[NM_BDG_MAXPORTS];
205
206 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
207
208
209 /*
210 * The function to decide the destination port.
211 * It returns either of an index of the destination port,
212 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
213 * forward this packet. ring_nr is the source ring index, and the
214 * function may overwrite this value to forward this packet to a
215 * different ring index.
216 * This function must be set by netmap_bdgctl().
217 */
218 struct netmap_bdg_ops bdg_ops;
219
220 /* the forwarding table, MAC+ports.
221 * XXX should be changed to an argument to be passed to
222 * the lookup function, and allocated on attach
223 */
224 struct nm_hash_ent ht[NM_BDG_HASH];
225
226 #ifdef CONFIG_NET_NS
227 struct net *ns;
228 #endif /* CONFIG_NET_NS */
229 };
230
231 const char*
netmap_bdg_name(struct netmap_vp_adapter * vp)232 netmap_bdg_name(struct netmap_vp_adapter *vp)
233 {
234 struct nm_bridge *b = vp->na_bdg;
235 if (b == NULL)
236 return NULL;
237 return b->bdg_basename;
238 }
239
240
#ifndef CONFIG_NET_NS
/*
 * Array of bridge descriptors, used when CONFIG_NET_NS is not defined.
 * XXX in principle the bridges could be created dynamically; for now
 * the array is static and deletions are protected by an exclusive lock.
 */
struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */
249
250
/*
 * Slightly optimized packet copy.
 *
 * Lengths of 1024 bytes or more are handed to memcpy() and copied
 * exactly. Shorter lengths are copied in 64-byte chunks of eight 64-bit
 * words, so the amount actually moved is l rounded up to a multiple of
 * 64; this is often faster than dealing with odd sizes. Nothing is
 * copied when l <= 0.
 *
 * The caller must guarantee that both buffers have room for the
 * rounded-up length and that they do not overlap.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *from = _src;
	uint64_t *to = _dst;
	int left = l;

	/* large copies are expected to be rare */
	if (__builtin_expect(!!(l >= 1024), 0)) {
		memcpy(to, from, l);
		return;
	}

	while (__builtin_expect(!!(left > 0), 1)) {
		/* one 64-byte chunk */
		to[0] = from[0];
		to[1] = from[1];
		to[2] = from[2];
		to[3] = from[3];
		to[4] = from[4];
		to[5] = from[5];
		to[6] = from[6];
		to[7] = from[7];
		from += 8;
		to += 8;
		left -= 64;
	}
}
279
280
281 /*
282 * locate a bridge among the existing ones.
283 * MUST BE CALLED WITH NMG_LOCK()
284 *
285 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
286 * We assume that this is called with a name of at least NM_NAME chars.
287 */
288 static struct nm_bridge *
nm_find_bridge(const char * name,int create)289 nm_find_bridge(const char *name, int create)
290 {
291 int i, l, namelen;
292 struct nm_bridge *b = NULL, *bridges;
293 u_int num_bridges;
294
295 NMG_LOCK_ASSERT();
296
297 netmap_bns_getbridges(&bridges, &num_bridges);
298
299 namelen = strlen(NM_NAME); /* base length */
300 l = name ? strlen(name) : 0; /* actual length */
301 if (l < namelen) {
302 D("invalid bridge name %s", name ? name : NULL);
303 return NULL;
304 }
305 for (i = namelen + 1; i < l; i++) {
306 if (name[i] == ':') {
307 namelen = i;
308 break;
309 }
310 }
311 if (namelen >= IFNAMSIZ)
312 namelen = IFNAMSIZ;
313 ND("--- prefix is '%.*s' ---", namelen, name);
314
315 /* lookup the name, remember empty slot if there is one */
316 for (i = 0; i < num_bridges; i++) {
317 struct nm_bridge *x = bridges + i;
318
319 if (x->bdg_active_ports == 0) {
320 if (create && b == NULL)
321 b = x; /* record empty slot */
322 } else if (x->bdg_namelen != namelen) {
323 continue;
324 } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
325 ND("found '%.*s' at %d", namelen, name, i);
326 b = x;
327 break;
328 }
329 }
330 if (i == num_bridges && b) { /* name not found, can create entry */
331 /* initialize the bridge */
332 strncpy(b->bdg_basename, name, namelen);
333 ND("create new bridge %s with ports %d", b->bdg_basename,
334 b->bdg_active_ports);
335 b->bdg_namelen = namelen;
336 b->bdg_active_ports = 0;
337 for (i = 0; i < NM_BDG_MAXPORTS; i++)
338 b->bdg_port_index[i] = i;
339 /* set the default function */
340 b->bdg_ops.lookup = netmap_bdg_learning;
341 /* reset the MAC address table */
342 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
343 NM_BNS_GET(b);
344 }
345 return b;
346 }
347
348
349 /*
350 * Free the forwarding tables for rings attached to switch ports.
351 */
352 static void
nm_free_bdgfwd(struct netmap_adapter * na)353 nm_free_bdgfwd(struct netmap_adapter *na)
354 {
355 int nrings, i;
356 struct netmap_kring *kring;
357
358 NMG_LOCK_ASSERT();
359 nrings = na->num_tx_rings;
360 kring = na->tx_rings;
361 for (i = 0; i < nrings; i++) {
362 if (kring[i].nkr_ft) {
363 free(kring[i].nkr_ft, M_NETMAP);
364 kring[i].nkr_ft = NULL; /* protect from freeing twice */
365 }
366 }
367 }
368
369
370 /*
371 * Allocate the forwarding tables for the rings attached to the bridge ports.
372 */
373 static int
nm_alloc_bdgfwd(struct netmap_adapter * na)374 nm_alloc_bdgfwd(struct netmap_adapter *na)
375 {
376 int nrings, l, i, num_dstq;
377 struct netmap_kring *kring;
378
379 NMG_LOCK_ASSERT();
380 /* all port:rings + broadcast */
381 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
382 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
383 l += sizeof(struct nm_bdg_q) * num_dstq;
384 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
385
386 nrings = netmap_real_rings(na, NR_TX);
387 kring = na->tx_rings;
388 for (i = 0; i < nrings; i++) {
389 struct nm_bdg_fwd *ft;
390 struct nm_bdg_q *dstq;
391 int j;
392
393 ft = malloc(l, M_NETMAP, M_NOWAIT | M_ZERO);
394 if (!ft) {
395 nm_free_bdgfwd(na);
396 return ENOMEM;
397 }
398 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
399 for (j = 0; j < num_dstq; j++) {
400 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
401 dstq[j].bq_len = 0;
402 }
403 kring[i].nkr_ft = ft;
404 }
405 return 0;
406 }
407
408
409 /* remove from bridge b the ports in slots hw and sw
410 * (sw can be -1 if not needed)
411 */
412 static void
netmap_bdg_detach_common(struct nm_bridge * b,int hw,int sw)413 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
414 {
415 int s_hw = hw, s_sw = sw;
416 int i, lim =b->bdg_active_ports;
417 uint8_t tmp[NM_BDG_MAXPORTS];
418
419 /*
420 New algorithm:
421 make a copy of bdg_port_index;
422 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
423 in the array of bdg_port_index, replacing them with
424 entries from the bottom of the array;
425 decrement bdg_active_ports;
426 acquire BDG_WLOCK() and copy back the array.
427 */
428
429 if (netmap_verbose)
430 D("detach %d and %d (lim %d)", hw, sw, lim);
431 /* make a copy of the list of active ports, update it,
432 * and then copy back within BDG_WLOCK().
433 */
434 memcpy(tmp, b->bdg_port_index, sizeof(tmp));
435 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
436 if (hw >= 0 && tmp[i] == hw) {
437 ND("detach hw %d at %d", hw, i);
438 lim--; /* point to last active port */
439 tmp[i] = tmp[lim]; /* swap with i */
440 tmp[lim] = hw; /* now this is inactive */
441 hw = -1;
442 } else if (sw >= 0 && tmp[i] == sw) {
443 ND("detach sw %d at %d", sw, i);
444 lim--;
445 tmp[i] = tmp[lim];
446 tmp[lim] = sw;
447 sw = -1;
448 } else {
449 i++;
450 }
451 }
452 if (hw >= 0 || sw >= 0) {
453 D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
454 }
455
456 BDG_WLOCK(b);
457 if (b->bdg_ops.dtor)
458 b->bdg_ops.dtor(b->bdg_ports[s_hw]);
459 b->bdg_ports[s_hw] = NULL;
460 if (s_sw >= 0) {
461 b->bdg_ports[s_sw] = NULL;
462 }
463 memcpy(b->bdg_port_index, tmp, sizeof(tmp));
464 b->bdg_active_ports = lim;
465 BDG_WUNLOCK(b);
466
467 ND("now %d active ports", lim);
468 if (lim == 0) {
469 ND("marking bridge %s as free", b->bdg_basename);
470 bzero(&b->bdg_ops, sizeof(b->bdg_ops));
471 NM_BNS_PUT(b);
472 }
473 }
474
475 /* nm_bdg_ctl callback for VALE ports */
476 static int
netmap_vp_bdg_ctl(struct netmap_adapter * na,struct nmreq * nmr,int attach)477 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
478 {
479 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
480 struct nm_bridge *b = vpna->na_bdg;
481
482 if (attach)
483 return 0; /* nothing to do */
484 if (b) {
485 netmap_set_all_rings(na, 0 /* disable */);
486 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
487 vpna->na_bdg = NULL;
488 netmap_set_all_rings(na, 1 /* enable */);
489 }
490 /* I have took reference just for attach */
491 netmap_adapter_put(na);
492 return 0;
493 }
494
495 /* nm_dtor callback for ephemeral VALE ports */
496 static void
netmap_vp_dtor(struct netmap_adapter * na)497 netmap_vp_dtor(struct netmap_adapter *na)
498 {
499 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
500 struct nm_bridge *b = vpna->na_bdg;
501
502 ND("%s has %d references", na->name, na->na_refcount);
503
504 if (b) {
505 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
506 }
507 }
508
509 /* remove a persistent VALE port from the system */
510 static int
nm_vi_destroy(const char * name)511 nm_vi_destroy(const char *name)
512 {
513 struct ifnet *ifp;
514 int error;
515
516 ifp = ifunit_ref(name);
517 if (!ifp)
518 return ENXIO;
519 NMG_LOCK();
520 /* make sure this is actually a VALE port */
521 if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
522 error = EINVAL;
523 goto err;
524 }
525
526 if (NA(ifp)->na_refcount > 1) {
527 error = EBUSY;
528 goto err;
529 }
530 NMG_UNLOCK();
531
532 D("destroying a persistent vale interface %s", ifp->if_xname);
533 /* Linux requires all the references are released
534 * before unregister
535 */
536 if_rele(ifp);
537 netmap_detach(ifp);
538 nm_vi_detach(ifp);
539 return 0;
540
541 err:
542 NMG_UNLOCK();
543 if_rele(ifp);
544 return error;
545 }
546
547 /*
548 * Create a virtual interface registered to the system.
549 * The interface will be attached to a bridge later.
550 */
551 static int
nm_vi_create(struct nmreq * nmr)552 nm_vi_create(struct nmreq *nmr)
553 {
554 struct ifnet *ifp;
555 struct netmap_vp_adapter *vpna;
556 int error;
557
558 /* don't include VALE prefix */
559 if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
560 return EINVAL;
561 ifp = ifunit_ref(nmr->nr_name);
562 if (ifp) { /* already exist, cannot create new one */
563 if_rele(ifp);
564 return EEXIST;
565 }
566 error = nm_vi_persist(nmr->nr_name, &ifp);
567 if (error)
568 return error;
569
570 NMG_LOCK();
571 /* netmap_vp_create creates a struct netmap_vp_adapter */
572 error = netmap_vp_create(nmr, ifp, &vpna);
573 if (error) {
574 D("error %d", error);
575 nm_vi_detach(ifp);
576 return error;
577 }
578 /* persist-specific routines */
579 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
580 netmap_adapter_get(&vpna->up);
581 NMG_UNLOCK();
582 D("created %s", ifp->if_xname);
583 return 0;
584 }
585
586 /* Try to get a reference to a netmap adapter attached to a VALE switch.
587 * If the adapter is found (or is created), this function returns 0, a
588 * non NULL pointer is returned into *na, and the caller holds a
589 * reference to the adapter.
590 * If an adapter is not found, then no reference is grabbed and the
591 * function returns an error code, or 0 if there is just a VALE prefix
592 * mismatch. Therefore the caller holds a reference when
593 * (*na != NULL && return == 0).
594 */
595 int
netmap_get_bdg_na(struct nmreq * nmr,struct netmap_adapter ** na,int create)596 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
597 {
598 char *nr_name = nmr->nr_name;
599 const char *ifname;
600 struct ifnet *ifp;
601 int error = 0;
602 struct netmap_vp_adapter *vpna, *hostna = NULL;
603 struct nm_bridge *b;
604 int i, j, cand = -1, cand2 = -1;
605 int needed;
606
607 *na = NULL; /* default return value */
608
609 /* first try to see if this is a bridge port. */
610 NMG_LOCK_ASSERT();
611 if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
612 return 0; /* no error, but no VALE prefix */
613 }
614
615 b = nm_find_bridge(nr_name, create);
616 if (b == NULL) {
617 D("no bridges available for '%s'", nr_name);
618 return (create ? ENOMEM : ENXIO);
619 }
620 if (strlen(nr_name) < b->bdg_namelen) /* impossible */
621 panic("x");
622
623 /* Now we are sure that name starts with the bridge's name,
624 * lookup the port in the bridge. We need to scan the entire
625 * list. It is not important to hold a WLOCK on the bridge
626 * during the search because NMG_LOCK already guarantees
627 * that there are no other possible writers.
628 */
629
630 /* lookup in the local list of ports */
631 for (j = 0; j < b->bdg_active_ports; j++) {
632 i = b->bdg_port_index[j];
633 vpna = b->bdg_ports[i];
634 // KASSERT(na != NULL);
635 ND("checking %s", vpna->up.name);
636 if (!strcmp(vpna->up.name, nr_name)) {
637 netmap_adapter_get(&vpna->up);
638 ND("found existing if %s refs %d", nr_name)
639 *na = &vpna->up;
640 return 0;
641 }
642 }
643 /* not found, should we create it? */
644 if (!create)
645 return ENXIO;
646 /* yes we should, see if we have space to attach entries */
647 needed = 2; /* in some cases we only need 1 */
648 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
649 D("bridge full %d, cannot create new port", b->bdg_active_ports);
650 return ENOMEM;
651 }
652 /* record the next two ports available, but do not allocate yet */
653 cand = b->bdg_port_index[b->bdg_active_ports];
654 cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
655 ND("+++ bridge %s port %s used %d avail %d %d",
656 b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
657
658 /*
659 * try see if there is a matching NIC with this name
660 * (after the bridge's name)
661 */
662 ifname = nr_name + b->bdg_namelen + 1;
663 ifp = ifunit_ref(ifname);
664 if (!ifp) {
665 /* Create an ephemeral virtual port
666 * This block contains all the ephemeral-specific logics
667 */
668 if (nmr->nr_cmd) {
669 /* nr_cmd must be 0 for a virtual port */
670 return EINVAL;
671 }
672
673 /* bdg_netmap_attach creates a struct netmap_adapter */
674 error = netmap_vp_create(nmr, NULL, &vpna);
675 if (error) {
676 D("error %d", error);
677 free(ifp, M_NETMAP);
678 return error;
679 }
680 /* shortcut - we can skip get_hw_na(),
681 * ownership check and nm_bdg_attach()
682 */
683 } else {
684 struct netmap_adapter *hw;
685
686 error = netmap_get_hw_na(ifp, &hw);
687 if (error || hw == NULL)
688 goto out;
689
690 /* host adapter might not be created */
691 error = hw->nm_bdg_attach(nr_name, hw);
692 if (error)
693 goto out;
694 vpna = hw->na_vp;
695 hostna = hw->na_hostvp;
696 if_rele(ifp);
697 if (nmr->nr_arg1 != NETMAP_BDG_HOST)
698 hostna = NULL;
699 }
700
701 BDG_WLOCK(b);
702 vpna->bdg_port = cand;
703 ND("NIC %p to bridge port %d", vpna, cand);
704 /* bind the port to the bridge (virtual ports are not active) */
705 b->bdg_ports[cand] = vpna;
706 vpna->na_bdg = b;
707 b->bdg_active_ports++;
708 if (hostna != NULL) {
709 /* also bind the host stack to the bridge */
710 b->bdg_ports[cand2] = hostna;
711 hostna->bdg_port = cand2;
712 hostna->na_bdg = b;
713 b->bdg_active_ports++;
714 ND("host %p to bridge port %d", hostna, cand2);
715 }
716 ND("if %s refs %d", ifname, vpna->up.na_refcount);
717 BDG_WUNLOCK(b);
718 *na = &vpna->up;
719 netmap_adapter_get(*na);
720 return 0;
721
722 out:
723 if_rele(ifp);
724
725 return error;
726 }
727
728
729 /* Process NETMAP_BDG_ATTACH */
730 static int
nm_bdg_ctl_attach(struct nmreq * nmr)731 nm_bdg_ctl_attach(struct nmreq *nmr)
732 {
733 struct netmap_adapter *na;
734 int error;
735
736 NMG_LOCK();
737
738 error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
739 if (error) /* no device */
740 goto unlock_exit;
741
742 if (na == NULL) { /* VALE prefix missing */
743 error = EINVAL;
744 goto unlock_exit;
745 }
746
747 if (NETMAP_OWNED_BY_ANY(na)) {
748 error = EBUSY;
749 goto unref_exit;
750 }
751
752 if (na->nm_bdg_ctl) {
753 /* nop for VALE ports. The bwrap needs to put the hwna
754 * in netmap mode (see netmap_bwrap_bdg_ctl)
755 */
756 error = na->nm_bdg_ctl(na, nmr, 1);
757 if (error)
758 goto unref_exit;
759 ND("registered %s to netmap-mode", na->name);
760 }
761 NMG_UNLOCK();
762 return 0;
763
764 unref_exit:
765 netmap_adapter_put(na);
766 unlock_exit:
767 NMG_UNLOCK();
768 return error;
769 }
770
771
772 /* process NETMAP_BDG_DETACH */
773 static int
nm_bdg_ctl_detach(struct nmreq * nmr)774 nm_bdg_ctl_detach(struct nmreq *nmr)
775 {
776 struct netmap_adapter *na;
777 int error;
778
779 NMG_LOCK();
780 error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
781 if (error) { /* no device, or another bridge or user owns the device */
782 goto unlock_exit;
783 }
784
785 if (na == NULL) { /* VALE prefix missing */
786 error = EINVAL;
787 goto unlock_exit;
788 }
789
790 if (na->nm_bdg_ctl) {
791 /* remove the port from bridge. The bwrap
792 * also needs to put the hwna in normal mode
793 */
794 error = na->nm_bdg_ctl(na, nmr, 0);
795 }
796
797 netmap_adapter_put(na);
798 unlock_exit:
799 NMG_UNLOCK();
800 return error;
801
802 }
803
804
805 /* Called by either user's context (netmap_ioctl())
806 * or external kernel modules (e.g., Openvswitch).
807 * Operation is indicated in nmr->nr_cmd.
808 * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge
809 * requires bdg_ops argument; the other commands ignore this argument.
810 *
811 * Called without NMG_LOCK.
812 */
813 int
netmap_bdg_ctl(struct nmreq * nmr,struct netmap_bdg_ops * bdg_ops)814 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
815 {
816 struct nm_bridge *b, *bridges;
817 struct netmap_adapter *na;
818 struct netmap_vp_adapter *vpna;
819 char *name = nmr->nr_name;
820 int cmd = nmr->nr_cmd, namelen = strlen(name);
821 int error = 0, i, j;
822 u_int num_bridges;
823
824 netmap_bns_getbridges(&bridges, &num_bridges);
825
826 switch (cmd) {
827 case NETMAP_BDG_NEWIF:
828 error = nm_vi_create(nmr);
829 break;
830
831 case NETMAP_BDG_DELIF:
832 error = nm_vi_destroy(nmr->nr_name);
833 break;
834
835 case NETMAP_BDG_ATTACH:
836 error = nm_bdg_ctl_attach(nmr);
837 break;
838
839 case NETMAP_BDG_DETACH:
840 error = nm_bdg_ctl_detach(nmr);
841 break;
842
843 case NETMAP_BDG_LIST:
844 /* this is used to enumerate bridges and ports */
845 if (namelen) { /* look up indexes of bridge and port */
846 if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
847 error = EINVAL;
848 break;
849 }
850 NMG_LOCK();
851 b = nm_find_bridge(name, 0 /* don't create */);
852 if (!b) {
853 error = ENOENT;
854 NMG_UNLOCK();
855 break;
856 }
857
858 error = ENOENT;
859 for (j = 0; j < b->bdg_active_ports; j++) {
860 i = b->bdg_port_index[j];
861 vpna = b->bdg_ports[i];
862 if (vpna == NULL) {
863 D("---AAAAAAAAARGH-------");
864 continue;
865 }
866 /* the former and the latter identify a
867 * virtual port and a NIC, respectively
868 */
869 if (!strcmp(vpna->up.name, name)) {
870 /* bridge index */
871 nmr->nr_arg1 = b - bridges;
872 nmr->nr_arg2 = i; /* port index */
873 error = 0;
874 break;
875 }
876 }
877 NMG_UNLOCK();
878 } else {
879 /* return the first non-empty entry starting from
880 * bridge nr_arg1 and port nr_arg2.
881 *
882 * Users can detect the end of the same bridge by
883 * seeing the new and old value of nr_arg1, and can
884 * detect the end of all the bridge by error != 0
885 */
886 i = nmr->nr_arg1;
887 j = nmr->nr_arg2;
888
889 NMG_LOCK();
890 for (error = ENOENT; i < NM_BRIDGES; i++) {
891 b = bridges + i;
892 if (j >= b->bdg_active_ports) {
893 j = 0; /* following bridges scan from 0 */
894 continue;
895 }
896 nmr->nr_arg1 = i;
897 nmr->nr_arg2 = j;
898 j = b->bdg_port_index[j];
899 vpna = b->bdg_ports[j];
900 strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
901 error = 0;
902 break;
903 }
904 NMG_UNLOCK();
905 }
906 break;
907
908 case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
909 /* register callbacks to the given bridge.
910 * nmr->nr_name may be just bridge's name (including ':'
911 * if it is not just NM_NAME).
912 */
913 if (!bdg_ops) {
914 error = EINVAL;
915 break;
916 }
917 NMG_LOCK();
918 b = nm_find_bridge(name, 0 /* don't create */);
919 if (!b) {
920 error = EINVAL;
921 } else {
922 b->bdg_ops = *bdg_ops;
923 }
924 NMG_UNLOCK();
925 break;
926
927 case NETMAP_BDG_VNET_HDR:
928 /* Valid lengths for the virtio-net header are 0 (no header),
929 10 and 12. */
930 if (nmr->nr_arg1 != 0 &&
931 nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
932 nmr->nr_arg1 != 12) {
933 error = EINVAL;
934 break;
935 }
936 NMG_LOCK();
937 error = netmap_get_bdg_na(nmr, &na, 0);
938 if (na && !error) {
939 vpna = (struct netmap_vp_adapter *)na;
940 vpna->virt_hdr_len = nmr->nr_arg1;
941 if (vpna->virt_hdr_len)
942 vpna->mfs = NETMAP_BUF_SIZE(na);
943 D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
944 netmap_adapter_put(na);
945 }
946 NMG_UNLOCK();
947 break;
948
949 default:
950 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
951 error = EINVAL;
952 break;
953 }
954 return error;
955 }
956
957 int
netmap_bdg_config(struct nmreq * nmr)958 netmap_bdg_config(struct nmreq *nmr)
959 {
960 struct nm_bridge *b;
961 int error = EINVAL;
962
963 NMG_LOCK();
964 b = nm_find_bridge(nmr->nr_name, 0);
965 if (!b) {
966 NMG_UNLOCK();
967 return error;
968 }
969 NMG_UNLOCK();
970 /* Don't call config() with NMG_LOCK() held */
971 BDG_RLOCK(b);
972 if (b->bdg_ops.config != NULL)
973 error = b->bdg_ops.config((struct nm_ifreq *)nmr);
974 BDG_RUNLOCK(b);
975 return error;
976 }
977
978
979 /* nm_krings_create callback for VALE ports.
980 * Calls the standard netmap_krings_create, then adds leases on rx
981 * rings and bdgfwd on tx rings.
982 */
983 static int
netmap_vp_krings_create(struct netmap_adapter * na)984 netmap_vp_krings_create(struct netmap_adapter *na)
985 {
986 u_int tailroom;
987 int error, i;
988 uint32_t *leases;
989 u_int nrx = netmap_real_rings(na, NR_RX);
990
991 /*
992 * Leases are attached to RX rings on vale ports
993 */
994 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
995
996 error = netmap_krings_create(na, tailroom);
997 if (error)
998 return error;
999
1000 leases = na->tailroom;
1001
1002 for (i = 0; i < nrx; i++) { /* Receive rings */
1003 na->rx_rings[i].nkr_leases = leases;
1004 leases += na->num_rx_desc;
1005 }
1006
1007 error = nm_alloc_bdgfwd(na);
1008 if (error) {
1009 netmap_krings_delete(na);
1010 return error;
1011 }
1012
1013 return 0;
1014 }
1015
1016
/*
 * nm_krings_delete callback for VALE ports: release the forwarding
 * tables first, then delete the krings with the standard routine.
 */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);		/* tx rings: forwarding tables */
	netmap_krings_delete(na);	/* the krings themselves */
}
1024
1025
1026 static int
1027 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1028 struct netmap_vp_adapter *na, u_int ring_nr);
1029
1030
1031 /*
1032 * main dispatch routine for the bridge.
1033 * Grab packets from a kring, move them into the ft structure
1034 * associated to the tx (input) port. Max one instance per port,
1035 * filtered on input (ioctl, poll or XXX).
1036 * Returns the next position in the ring.
1037 */
1038 static int
nm_bdg_preflush(struct netmap_kring * kring,u_int end)1039 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1040 {
1041 struct netmap_vp_adapter *na =
1042 (struct netmap_vp_adapter*)kring->na;
1043 struct netmap_ring *ring = kring->ring;
1044 struct nm_bdg_fwd *ft;
1045 u_int ring_nr = kring->ring_id;
1046 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1047 u_int ft_i = 0; /* start from 0 */
1048 u_int frags = 1; /* how many frags ? */
1049 struct nm_bridge *b = na->na_bdg;
1050
1051 /* To protect against modifications to the bridge we acquire a
1052 * shared lock, waiting if we can sleep (if the source port is
1053 * attached to a user process) or with a trylock otherwise (NICs).
1054 */
1055 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1056 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1057 BDG_RLOCK(b);
1058 else if (!BDG_RTRYLOCK(b))
1059 return 0;
1060 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1061 ft = kring->nkr_ft;
1062
1063 for (; likely(j != end); j = nm_next(j, lim)) {
1064 struct netmap_slot *slot = &ring->slot[j];
1065 char *buf;
1066
1067 ft[ft_i].ft_len = slot->len;
1068 ft[ft_i].ft_flags = slot->flags;
1069
1070 ND("flags is 0x%x", slot->flags);
1071 /* we do not use the buf changed flag, but we still need to reset it */
1072 slot->flags &= ~NS_BUF_CHANGED;
1073
1074 /* this slot goes into a list so initialize the link field */
1075 ft[ft_i].ft_next = NM_FT_NULL;
1076 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1077 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1078 if (unlikely(buf == NULL)) {
1079 RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1080 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1081 kring->name, j, ft[ft_i].ft_len);
1082 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1083 ft[ft_i].ft_len = 0;
1084 ft[ft_i].ft_flags = 0;
1085 }
1086 __builtin_prefetch(buf);
1087 ++ft_i;
1088 if (slot->flags & NS_MOREFRAG) {
1089 frags++;
1090 continue;
1091 }
1092 if (unlikely(netmap_verbose && frags > 1))
1093 RD(5, "%d frags at %d", frags, ft_i - frags);
1094 ft[ft_i - frags].ft_frags = frags;
1095 frags = 1;
1096 if (unlikely((int)ft_i >= bridge_batch))
1097 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1098 }
1099 if (frags > 1) {
1100 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
1101 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
1102 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
1103 ft[ft_i - frags].ft_frags = frags - 1;
1104 }
1105 if (ft_i)
1106 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1107 BDG_RUNLOCK(b);
1108 return j;
1109 }
1110
1111
1112 /* ----- FreeBSD if_bridge hash function ------- */
1113
1114 /*
1115 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1116 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1117 *
1118 * http://www.burtleburtle.net/bob/hash/spooky.html
1119 */
1120 #define mix(a, b, c) \
1121 do { \
1122 a -= b; a -= c; a ^= (c >> 13); \
1123 b -= c; b -= a; b ^= (a << 8); \
1124 c -= a; c -= b; c ^= (b >> 13); \
1125 a -= b; a -= c; a ^= (c >> 12); \
1126 b -= c; b -= a; b ^= (a << 16); \
1127 c -= a; c -= b; c ^= (b >> 5); \
1128 a -= b; a -= c; a ^= (c >> 3); \
1129 b -= c; b -= a; b ^= (a << 10); \
1130 c -= a; c -= b; c ^= (b >> 15); \
1131 } while (/*CONSTCOND*/0)
1132
1133
1134 static __inline uint32_t
nm_bridge_rthash(const uint8_t * addr)1135 nm_bridge_rthash(const uint8_t *addr)
1136 {
1137 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key
1138
1139 b += addr[5] << 8;
1140 b += addr[4];
1141 a += addr[3] << 24;
1142 a += addr[2] << 16;
1143 a += addr[1] << 8;
1144 a += addr[0];
1145
1146 mix(a, b, c);
1147 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
1148 return (c & BRIDGE_RTHASH_MASK);
1149 }
1150
1151 #undef mix
1152
1153
1154 /* nm_register callback for VALE ports */
1155 static int
netmap_vp_reg(struct netmap_adapter * na,int onoff)1156 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1157 {
1158 struct netmap_vp_adapter *vpna =
1159 (struct netmap_vp_adapter*)na;
1160
1161 /* persistent ports may be put in netmap mode
1162 * before being attached to a bridge
1163 */
1164 if (vpna->na_bdg)
1165 BDG_WLOCK(vpna->na_bdg);
1166 if (onoff) {
1167 na->na_flags |= NAF_NETMAP_ON;
1168 /* XXX on FreeBSD, persistent VALE ports should also
1169 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1170 */
1171 } else {
1172 na->na_flags &= ~NAF_NETMAP_ON;
1173 }
1174 if (vpna->na_bdg)
1175 BDG_WUNLOCK(vpna->na_bdg);
1176 return 0;
1177 }
1178
1179
1180 /*
1181 * Lookup function for a learning bridge.
1182 * Update the hash table with the source address,
1183 * and then returns the destination port index, and the
1184 * ring in *dst_ring (at the moment, always use ring 0)
1185 */
1186 u_int
netmap_bdg_learning(struct nm_bdg_fwd * ft,uint8_t * dst_ring,struct netmap_vp_adapter * na)1187 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1188 struct netmap_vp_adapter *na)
1189 {
1190 uint8_t *buf = ft->ft_buf;
1191 u_int buf_len = ft->ft_len;
1192 struct nm_hash_ent *ht = na->na_bdg->ht;
1193 uint32_t sh, dh;
1194 u_int dst, mysrc = na->bdg_port;
1195 uint64_t smac, dmac;
1196
1197 /* safety check, unfortunately we have many cases */
1198 if (buf_len >= 14 + na->virt_hdr_len) {
1199 /* virthdr + mac_hdr in the same slot */
1200 buf += na->virt_hdr_len;
1201 buf_len -= na->virt_hdr_len;
1202 } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1203 /* only header in first fragment */
1204 ft++;
1205 buf = ft->ft_buf;
1206 buf_len = ft->ft_len;
1207 } else {
1208 RD(5, "invalid buf format, length %d", buf_len);
1209 return NM_BDG_NOPORT;
1210 }
1211 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1212 smac = le64toh(*(uint64_t *)(buf + 4));
1213 smac >>= 16;
1214
1215 /*
1216 * The hash is somewhat expensive, there might be some
1217 * worthwhile optimizations here.
1218 */
1219 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1220 uint8_t *s = buf+6;
1221 sh = nm_bridge_rthash(s); // XXX hash of source
1222 /* update source port forwarding entry */
1223 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
1224 ht[sh].ports = mysrc;
1225 if (netmap_verbose)
1226 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1227 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1228 }
1229 dst = NM_BDG_BROADCAST;
1230 if ((buf[0] & 1) == 0) { /* unicast */
1231 dh = nm_bridge_rthash(buf); // XXX hash of dst
1232 if (ht[dh].mac == dmac) { /* found dst */
1233 dst = ht[dh].ports;
1234 }
1235 /* XXX otherwise return NM_BDG_UNKNOWN ? */
1236 }
1237 return dst;
1238 }
1239
1240
1241 /*
1242 * Available space in the ring. Only used in VALE code
1243 * and only with is_rx = 1
1244 */
1245 static inline uint32_t
nm_kr_space(struct netmap_kring * k,int is_rx)1246 nm_kr_space(struct netmap_kring *k, int is_rx)
1247 {
1248 int space;
1249
1250 if (is_rx) {
1251 int busy = k->nkr_hwlease - k->nr_hwcur;
1252 if (busy < 0)
1253 busy += k->nkr_num_slots;
1254 space = k->nkr_num_slots - 1 - busy;
1255 } else {
1256 /* XXX never used in this branch */
1257 space = k->nr_hwtail - k->nkr_hwlease;
1258 if (space < 0)
1259 space += k->nkr_num_slots;
1260 }
1261 #if 0
1262 // sanity check
1263 if (k->nkr_hwlease >= k->nkr_num_slots ||
1264 k->nr_hwcur >= k->nkr_num_slots ||
1265 k->nr_tail >= k->nkr_num_slots ||
1266 busy < 0 ||
1267 busy >= k->nkr_num_slots) {
1268 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1269 k->nkr_lease_idx, k->nkr_num_slots);
1270 }
1271 #endif
1272 return space;
1273 }
1274
1275
1276
1277
1278 /* make a lease on the kring for N positions. return the
1279 * lease index
1280 * XXX only used in VALE code and with is_rx = 1
1281 */
1282 static inline uint32_t
nm_kr_lease(struct netmap_kring * k,u_int n,int is_rx)1283 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1284 {
1285 uint32_t lim = k->nkr_num_slots - 1;
1286 uint32_t lease_idx = k->nkr_lease_idx;
1287
1288 k->nkr_leases[lease_idx] = NR_NOSLOT;
1289 k->nkr_lease_idx = nm_next(lease_idx, lim);
1290
1291 if (n > nm_kr_space(k, is_rx)) {
1292 D("invalid request for %d slots", n);
1293 panic("x");
1294 }
1295 /* XXX verify that there are n slots */
1296 k->nkr_hwlease += n;
1297 if (k->nkr_hwlease > lim)
1298 k->nkr_hwlease -= lim + 1;
1299
1300 if (k->nkr_hwlease >= k->nkr_num_slots ||
1301 k->nr_hwcur >= k->nkr_num_slots ||
1302 k->nr_hwtail >= k->nkr_num_slots ||
1303 k->nkr_lease_idx >= k->nkr_num_slots) {
1304 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1305 k->na->name,
1306 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1307 k->nkr_lease_idx, k->nkr_num_slots);
1308 }
1309 return lease_idx;
1310 }
1311
1312 /*
1313 *
1314 * This flush routine supports only unicast and broadcast but a large
1315 * number of ports, and lets us replace the learn and dispatch functions.
1316 */
1317 int
nm_bdg_flush(struct nm_bdg_fwd * ft,u_int n,struct netmap_vp_adapter * na,u_int ring_nr)1318 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1319 u_int ring_nr)
1320 {
1321 struct nm_bdg_q *dst_ents, *brddst;
1322 uint16_t num_dsts = 0, *dsts;
1323 struct nm_bridge *b = na->na_bdg;
1324 u_int i, j, me = na->bdg_port;
1325
1326 /*
1327 * The work area (pointed by ft) is followed by an array of
1328 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
1329 * queues per port plus one for the broadcast traffic.
1330 * Then we have an array of destination indexes.
1331 */
1332 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1333 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1334
1335 /* first pass: find a destination for each packet in the batch */
1336 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1337 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1338 uint16_t dst_port, d_i;
1339 struct nm_bdg_q *d;
1340
1341 ND("slot %d frags %d", i, ft[i].ft_frags);
1342 /* Drop the packet if the virtio-net header is not into the first
1343 fragment nor at the very beginning of the second. */
1344 if (unlikely(na->virt_hdr_len > ft[i].ft_len))
1345 continue;
1346 dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
1347 if (netmap_verbose > 255)
1348 RD(5, "slot %d port %d -> %d", i, me, dst_port);
1349 if (dst_port == NM_BDG_NOPORT)
1350 continue; /* this packet is identified to be dropped */
1351 else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1352 continue;
1353 else if (dst_port == NM_BDG_BROADCAST)
1354 dst_ring = 0; /* broadcasts always go to ring 0 */
1355 else if (unlikely(dst_port == me ||
1356 !b->bdg_ports[dst_port]))
1357 continue;
1358
1359 /* get a position in the scratch pad */
1360 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1361 d = dst_ents + d_i;
1362
1363 /* append the first fragment to the list */
1364 if (d->bq_head == NM_FT_NULL) { /* new destination */
1365 d->bq_head = d->bq_tail = i;
1366 /* remember this position to be scanned later */
1367 if (dst_port != NM_BDG_BROADCAST)
1368 dsts[num_dsts++] = d_i;
1369 } else {
1370 ft[d->bq_tail].ft_next = i;
1371 d->bq_tail = i;
1372 }
1373 d->bq_len += ft[i].ft_frags;
1374 }
1375
1376 /*
1377 * Broadcast traffic goes to ring 0 on all destinations.
1378 * So we need to add these rings to the list of ports to scan.
1379 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1380 * expensive. We should keep a compact list of active destinations
1381 * so we could shorten this loop.
1382 */
1383 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1384 if (brddst->bq_head != NM_FT_NULL) {
1385 for (j = 0; likely(j < b->bdg_active_ports); j++) {
1386 uint16_t d_i;
1387 i = b->bdg_port_index[j];
1388 if (unlikely(i == me))
1389 continue;
1390 d_i = i * NM_BDG_MAXRINGS;
1391 if (dst_ents[d_i].bq_head == NM_FT_NULL)
1392 dsts[num_dsts++] = d_i;
1393 }
1394 }
1395
1396 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1397 /* second pass: scan destinations */
1398 for (i = 0; i < num_dsts; i++) {
1399 struct netmap_vp_adapter *dst_na;
1400 struct netmap_kring *kring;
1401 struct netmap_ring *ring;
1402 u_int dst_nr, lim, j, d_i, next, brd_next;
1403 u_int needed, howmany;
1404 int retry = netmap_txsync_retry;
1405 struct nm_bdg_q *d;
1406 uint32_t my_start = 0, lease_idx = 0;
1407 int nrings;
1408 int virt_hdr_mismatch = 0;
1409
1410 d_i = dsts[i];
1411 ND("second pass %d port %d", i, d_i);
1412 d = dst_ents + d_i;
1413 // XXX fix the division
1414 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1415 /* protect from the lookup function returning an inactive
1416 * destination port
1417 */
1418 if (unlikely(dst_na == NULL))
1419 goto cleanup;
1420 if (dst_na->up.na_flags & NAF_SW_ONLY)
1421 goto cleanup;
1422 /*
1423 * The interface may be in !netmap mode in two cases:
1424 * - when na is attached but not activated yet;
1425 * - when na is being deactivated but is still attached.
1426 */
1427 if (unlikely(!nm_netmap_on(&dst_na->up))) {
1428 ND("not in netmap mode!");
1429 goto cleanup;
1430 }
1431
1432 /* there is at least one either unicast or broadcast packet */
1433 brd_next = brddst->bq_head;
1434 next = d->bq_head;
1435 /* we need to reserve this many slots. If fewer are
1436 * available, some packets will be dropped.
1437 * Packets may have multiple fragments, so we may not use
1438 * there is a chance that we may not use all of the slots
1439 * we have claimed, so we will need to handle the leftover
1440 * ones when we regain the lock.
1441 */
1442 needed = d->bq_len + brddst->bq_len;
1443
1444 if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
1445 RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
1446 /* There is a virtio-net header/offloadings mismatch between
1447 * source and destination. The slower mismatch datapath will
1448 * be used to cope with all the mismatches.
1449 */
1450 virt_hdr_mismatch = 1;
1451 if (dst_na->mfs < na->mfs) {
1452 /* We may need to do segmentation offloadings, and so
1453 * we may need a number of destination slots greater
1454 * than the number of input slots ('needed').
1455 * We look for the smallest integer 'x' which satisfies:
1456 * needed * na->mfs + x * H <= x * na->mfs
1457 * where 'H' is the length of the longest header that may
1458 * be replicated in the segmentation process (e.g. for
1459 * TCPv4 we must account for ethernet header, IP header
1460 * and TCPv4 header).
1461 */
1462 needed = (needed * na->mfs) /
1463 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1464 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1465 }
1466 }
1467
1468 ND(5, "pass 2 dst %d is %x %s",
1469 i, d_i, is_vp ? "virtual" : "nic/host");
1470 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1471 nrings = dst_na->up.num_rx_rings;
1472 if (dst_nr >= nrings)
1473 dst_nr = dst_nr % nrings;
1474 kring = &dst_na->up.rx_rings[dst_nr];
1475 ring = kring->ring;
1476 lim = kring->nkr_num_slots - 1;
1477
1478 retry:
1479
1480 if (dst_na->retry && retry) {
1481 /* try to get some free slot from the previous run */
1482 kring->nm_notify(kring, 0);
1483 /* actually useful only for bwraps, since there
1484 * the notify will trigger a txsync on the hwna. VALE ports
1485 * have dst_na->retry == 0
1486 */
1487 }
1488 /* reserve the buffers in the queue and an entry
1489 * to report completion, and drop lock.
1490 * XXX this might become a helper function.
1491 */
1492 mtx_lock(&kring->q_lock);
1493 if (kring->nkr_stopped) {
1494 mtx_unlock(&kring->q_lock);
1495 goto cleanup;
1496 }
1497 my_start = j = kring->nkr_hwlease;
1498 howmany = nm_kr_space(kring, 1);
1499 if (needed < howmany)
1500 howmany = needed;
1501 lease_idx = nm_kr_lease(kring, howmany, 1);
1502 mtx_unlock(&kring->q_lock);
1503
1504 /* only retry if we need more than available slots */
1505 if (retry && needed <= howmany)
1506 retry = 0;
1507
1508 /* copy to the destination queue */
1509 while (howmany > 0) {
1510 struct netmap_slot *slot;
1511 struct nm_bdg_fwd *ft_p, *ft_end;
1512 u_int cnt;
1513
1514 /* find the queue from which we pick next packet.
1515 * NM_FT_NULL is always higher than valid indexes
1516 * so we never dereference it if the other list
1517 * has packets (and if both are empty we never
1518 * get here).
1519 */
1520 if (next < brd_next) {
1521 ft_p = ft + next;
1522 next = ft_p->ft_next;
1523 } else { /* insert broadcast */
1524 ft_p = ft + brd_next;
1525 brd_next = ft_p->ft_next;
1526 }
1527 cnt = ft_p->ft_frags; // cnt > 0
1528 if (unlikely(cnt > howmany))
1529 break; /* no more space */
1530 if (netmap_verbose && cnt > 1)
1531 RD(5, "rx %d frags to %d", cnt, j);
1532 ft_end = ft_p + cnt;
1533 if (unlikely(virt_hdr_mismatch)) {
1534 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1535 } else {
1536 howmany -= cnt;
1537 do {
1538 char *dst, *src = ft_p->ft_buf;
1539 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1540
1541 slot = &ring->slot[j];
1542 dst = NMB(&dst_na->up, slot);
1543
1544 ND("send [%d] %d(%d) bytes at %s:%d",
1545 i, (int)copy_len, (int)dst_len,
1546 NM_IFPNAME(dst_ifp), j);
1547 /* round to a multiple of 64 */
1548 copy_len = (copy_len + 63) & ~63;
1549
1550 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1551 copy_len > NETMAP_BUF_SIZE(&na->up))) {
1552 RD(5, "invalid len %d, down to 64", (int)copy_len);
1553 copy_len = dst_len = 64; // XXX
1554 }
1555 if (ft_p->ft_flags & NS_INDIRECT) {
1556 if (copyin(src, dst, copy_len)) {
1557 // invalid user pointer, pretend len is 0
1558 dst_len = 0;
1559 }
1560 } else {
1561 //memcpy(dst, src, copy_len);
1562 pkt_copy(src, dst, (int)copy_len);
1563 }
1564 slot->len = dst_len;
1565 slot->flags = (cnt << 8)| NS_MOREFRAG;
1566 j = nm_next(j, lim);
1567 needed--;
1568 ft_p++;
1569 } while (ft_p != ft_end);
1570 slot->flags = (cnt << 8); /* clear flag on last entry */
1571 }
1572 /* are we done ? */
1573 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1574 break;
1575 }
1576 {
1577 /* current position */
1578 uint32_t *p = kring->nkr_leases; /* shorthand */
1579 uint32_t update_pos;
1580 int still_locked = 1;
1581
1582 mtx_lock(&kring->q_lock);
1583 if (unlikely(howmany > 0)) {
1584 /* not used all bufs. If i am the last one
1585 * i can recover the slots, otherwise must
1586 * fill them with 0 to mark empty packets.
1587 */
1588 ND("leftover %d bufs", howmany);
1589 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1590 /* yes i am the last one */
1591 ND("roll back nkr_hwlease to %d", j);
1592 kring->nkr_hwlease = j;
1593 } else {
1594 while (howmany-- > 0) {
1595 ring->slot[j].len = 0;
1596 ring->slot[j].flags = 0;
1597 j = nm_next(j, lim);
1598 }
1599 }
1600 }
1601 p[lease_idx] = j; /* report I am done */
1602
1603 update_pos = kring->nr_hwtail;
1604
1605 if (my_start == update_pos) {
1606 /* all slots before my_start have been reported,
1607 * so scan subsequent leases to see if other ranges
1608 * have been completed, and to a selwakeup or txsync.
1609 */
1610 while (lease_idx != kring->nkr_lease_idx &&
1611 p[lease_idx] != NR_NOSLOT) {
1612 j = p[lease_idx];
1613 p[lease_idx] = NR_NOSLOT;
1614 lease_idx = nm_next(lease_idx, lim);
1615 }
1616 /* j is the new 'write' position. j != my_start
1617 * means there are new buffers to report
1618 */
1619 if (likely(j != my_start)) {
1620 kring->nr_hwtail = j;
1621 still_locked = 0;
1622 mtx_unlock(&kring->q_lock);
1623 kring->nm_notify(kring, 0);
1624 /* this is netmap_notify for VALE ports and
1625 * netmap_bwrap_notify for bwrap. The latter will
1626 * trigger a txsync on the underlying hwna
1627 */
1628 if (dst_na->retry && retry--) {
1629 /* XXX this is going to call nm_notify again.
1630 * Only useful for bwrap in virtual machines
1631 */
1632 goto retry;
1633 }
1634 }
1635 }
1636 if (still_locked)
1637 mtx_unlock(&kring->q_lock);
1638 }
1639 cleanup:
1640 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1641 d->bq_len = 0;
1642 }
1643 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1644 brddst->bq_len = 0;
1645 return 0;
1646 }
1647
1648 /* nm_txsync callback for VALE ports */
1649 static int
netmap_vp_txsync(struct netmap_kring * kring,int flags)1650 netmap_vp_txsync(struct netmap_kring *kring, int flags)
1651 {
1652 struct netmap_vp_adapter *na =
1653 (struct netmap_vp_adapter *)kring->na;
1654 u_int done;
1655 u_int const lim = kring->nkr_num_slots - 1;
1656 u_int const head = kring->rhead;
1657
1658 if (bridge_batch <= 0) { /* testing only */
1659 done = head; // used all
1660 goto done;
1661 }
1662 if (!na->na_bdg) {
1663 done = head;
1664 goto done;
1665 }
1666 if (bridge_batch > NM_BDG_BATCH)
1667 bridge_batch = NM_BDG_BATCH;
1668
1669 done = nm_bdg_preflush(kring, head);
1670 done:
1671 if (done != head)
1672 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1673 /*
1674 * packets between 'done' and 'cur' are left unsent.
1675 */
1676 kring->nr_hwcur = done;
1677 kring->nr_hwtail = nm_prev(done, lim);
1678 if (netmap_verbose)
1679 D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1680 return 0;
1681 }
1682
1683
1684 /* rxsync code used by VALE ports nm_rxsync callback and also
1685 * internally by the brwap
1686 */
1687 static int
netmap_vp_rxsync_locked(struct netmap_kring * kring,int flags)1688 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
1689 {
1690 struct netmap_adapter *na = kring->na;
1691 struct netmap_ring *ring = kring->ring;
1692 u_int nm_i, lim = kring->nkr_num_slots - 1;
1693 u_int head = kring->rhead;
1694 int n;
1695
1696 if (head > lim) {
1697 D("ouch dangerous reset!!!");
1698 n = netmap_ring_reinit(kring);
1699 goto done;
1700 }
1701
1702 /* First part, import newly received packets. */
1703 /* actually nothing to do here, they are already in the kring */
1704
1705 /* Second part, skip past packets that userspace has released. */
1706 nm_i = kring->nr_hwcur;
1707 if (nm_i != head) {
1708 /* consistency check, but nothing really important here */
1709 for (n = 0; likely(nm_i != head); n++) {
1710 struct netmap_slot *slot = &ring->slot[nm_i];
1711 void *addr = NMB(na, slot);
1712
1713 if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
1714 D("bad buffer index %d, ignore ?",
1715 slot->buf_idx);
1716 }
1717 slot->flags &= ~NS_BUF_CHANGED;
1718 nm_i = nm_next(nm_i, lim);
1719 }
1720 kring->nr_hwcur = head;
1721 }
1722
1723 n = 0;
1724 done:
1725 return n;
1726 }
1727
1728 /*
1729 * nm_rxsync callback for VALE ports
1730 * user process reading from a VALE switch.
1731 * Already protected against concurrent calls from userspace,
1732 * but we must acquire the queue's lock to protect against
1733 * writers on the same queue.
1734 */
1735 static int
netmap_vp_rxsync(struct netmap_kring * kring,int flags)1736 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
1737 {
1738 int n;
1739
1740 mtx_lock(&kring->q_lock);
1741 n = netmap_vp_rxsync_locked(kring, flags);
1742 mtx_unlock(&kring->q_lock);
1743 return n;
1744 }
1745
1746
1747 /* nm_bdg_attach callback for VALE ports
1748 * The na_vp port is this same netmap_adapter. There is no host port.
1749 */
1750 static int
netmap_vp_bdg_attach(const char * name,struct netmap_adapter * na)1751 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
1752 {
1753 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1754
1755 if (vpna->na_bdg)
1756 return EBUSY;
1757 na->na_vp = vpna;
1758 strncpy(na->name, name, sizeof(na->name));
1759 na->na_hostvp = NULL;
1760 return 0;
1761 }
1762
1763 /* create a netmap_vp_adapter that describes a VALE port.
1764 * Only persistent VALE ports have a non-null ifp.
1765 */
1766 static int
netmap_vp_create(struct nmreq * nmr,struct ifnet * ifp,struct netmap_vp_adapter ** ret)1767 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
1768 {
1769 struct netmap_vp_adapter *vpna;
1770 struct netmap_adapter *na;
1771 int error;
1772 u_int npipes = 0;
1773
1774 vpna = malloc(sizeof(*vpna), M_NETMAP, M_NOWAIT | M_ZERO);
1775 if (vpna == NULL)
1776 return ENOMEM;
1777
1778 na = &vpna->up;
1779
1780 na->ifp = ifp;
1781 strncpy(na->name, nmr->nr_name, sizeof(na->name));
1782
1783 /* bound checking */
1784 na->num_tx_rings = nmr->nr_tx_rings;
1785 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1786 nmr->nr_tx_rings = na->num_tx_rings; // write back
1787 na->num_rx_rings = nmr->nr_rx_rings;
1788 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1789 nmr->nr_rx_rings = na->num_rx_rings; // write back
1790 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1791 1, NM_BDG_MAXSLOTS, NULL);
1792 na->num_tx_desc = nmr->nr_tx_slots;
1793 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1794 1, NM_BDG_MAXSLOTS, NULL);
1795 /* validate number of pipes. We want at least 1,
1796 * but probably can do with some more.
1797 * So let's use 2 as default (when 0 is supplied)
1798 */
1799 npipes = nmr->nr_arg1;
1800 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1801 nmr->nr_arg1 = npipes; /* write back */
1802 /* validate extra bufs */
1803 nm_bound_var(&nmr->nr_arg3, 0, 0,
1804 128*NM_BDG_MAXSLOTS, NULL);
1805 na->num_rx_desc = nmr->nr_rx_slots;
1806 vpna->virt_hdr_len = 0;
1807 vpna->mfs = 1514;
1808 vpna->last_smac = ~0llu;
1809 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
1810 vpna->mfs = netmap_buf_size; */
1811 if (netmap_verbose)
1812 D("max frame size %u", vpna->mfs);
1813
1814 na->na_flags |= NAF_BDG_MAYSLEEP;
1815 /* persistent VALE ports look like hw devices
1816 * with a native netmap adapter
1817 */
1818 if (ifp)
1819 na->na_flags |= NAF_NATIVE;
1820 na->nm_txsync = netmap_vp_txsync;
1821 na->nm_rxsync = netmap_vp_rxsync;
1822 na->nm_register = netmap_vp_reg;
1823 na->nm_krings_create = netmap_vp_krings_create;
1824 na->nm_krings_delete = netmap_vp_krings_delete;
1825 na->nm_dtor = netmap_vp_dtor;
1826 na->nm_mem = netmap_mem_private_new(na->name,
1827 na->num_tx_rings, na->num_tx_desc,
1828 na->num_rx_rings, na->num_rx_desc,
1829 nmr->nr_arg3, npipes, &error);
1830 if (na->nm_mem == NULL)
1831 goto err;
1832 na->nm_bdg_attach = netmap_vp_bdg_attach;
1833 /* other nmd fields are set in the common routine */
1834 error = netmap_attach_common(na);
1835 if (error)
1836 goto err;
1837 *ret = vpna;
1838 return 0;
1839
1840 err:
1841 if (na->nm_mem != NULL)
1842 netmap_mem_delete(na->nm_mem);
1843 free(vpna, M_NETMAP);
1844 return error;
1845 }
1846
1847 /* Bridge wrapper code (bwrap).
1848 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1849 * VALE switch.
1850 * The main task is to swap the meaning of tx and rx rings to match the
1851 * expectations of the VALE switch code (see nm_bdg_flush).
1852 *
1853 * The bwrap works by interposing a netmap_bwrap_adapter between the
1854 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1855 * a netmap_vp_adapter to the rest of the system, but, internally, it
1856 * translates all callbacks to what the hwna expects.
1857 *
1858 * Note that we have to intercept callbacks coming from two sides:
1859 *
1860 * - callbacks coming from the netmap module are intercepted by
1861 * passing around the netmap_bwrap_adapter instead of the hwna
1862 *
1863 * - callbacks coming from outside of the netmap module only know
1864 * about the hwna. This, however, only happens in interrupt
1865 * handlers, where only the hwna->nm_notify callback is called.
1866 * What the bwrap does is to overwrite the hwna->nm_notify callback
1867 * with its own netmap_bwrap_intr_notify.
1868 * XXX This assumes that the hwna->nm_notify callback was the
1869 * standard netmap_notify(), as it is the case for nic adapters.
1870 * Any additional action performed by hwna->nm_notify will not be
1871 * performed by netmap_bwrap_intr_notify.
1872 *
1873 * Additionally, the bwrap can optionally attach the host rings pair
1874 * of the wrapped adapter to a different port of the switch.
1875 */
1876
1877
1878 static void
netmap_bwrap_dtor(struct netmap_adapter * na)1879 netmap_bwrap_dtor(struct netmap_adapter *na)
1880 {
1881 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1882 struct netmap_adapter *hwna = bna->hwna;
1883
1884 ND("na %p", na);
1885 /* drop reference to hwna->ifp.
1886 * If we don't do this, netmap_detach_common(na)
1887 * will think it has set NA(na->ifp) to NULL
1888 */
1889 na->ifp = NULL;
1890 /* for safety, also drop the possible reference
1891 * in the hostna
1892 */
1893 bna->host.up.ifp = NULL;
1894
1895 hwna->nm_mem = bna->save_nmd;
1896 hwna->na_private = NULL;
1897 hwna->na_vp = hwna->na_hostvp = NULL;
1898 hwna->na_flags &= ~NAF_BUSY;
1899 netmap_adapter_put(hwna);
1900
1901 }
1902
1903
1904 /*
1905 * Intr callback for NICs connected to a bridge.
1906 * Simply ignore tx interrupts (maybe we could try to recover space ?)
1907 * and pass received packets from nic to the bridge.
1908 *
1909 * XXX TODO check locking: this is called from the interrupt
1910 * handler so we should make sure that the interface is not
1911 * disconnected while passing down an interrupt.
1912 *
1913 * Note, no user process can access this NIC or the host stack.
1914 * The only part of the ring that is significant are the slots,
1915 * and head/cur/tail are set from the kring as needed
1916 * (part as a receive ring, part as a transmit ring).
1917 *
1918 * callback that overwrites the hwna notify callback.
1919 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1920 * The bridge wrapper then sends the packets through the bridge.
1921 */
1922 static int
netmap_bwrap_intr_notify(struct netmap_kring * kring,int flags)1923 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1924 {
1925 struct netmap_adapter *na = kring->na;
1926 struct netmap_bwrap_adapter *bna = na->na_private;
1927 struct netmap_kring *bkring;
1928 struct netmap_vp_adapter *vpna = &bna->up;
1929 u_int ring_nr = kring->ring_id;
1930 int error = 0;
1931
1932 if (netmap_verbose)
1933 D("%s %s 0x%x", na->name, kring->name, flags);
1934
1935 if (!nm_netmap_on(na))
1936 return 0;
1937
1938 bkring = &vpna->up.tx_rings[ring_nr];
1939
1940 /* make sure the ring is not disabled */
1941 if (nm_kr_tryget(kring))
1942 return 0;
1943
1944 if (netmap_verbose)
1945 D("%s head %d cur %d tail %d", na->name,
1946 kring->rhead, kring->rcur, kring->rtail);
1947
1948 /* simulate a user wakeup on the rx ring
1949 * fetch packets that have arrived.
1950 */
1951 error = kring->nm_sync(kring, 0);
1952 if (error)
1953 goto put_out;
1954 if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
1955 D("how strange, interrupt with no packets on %s",
1956 na->name);
1957 goto put_out;
1958 }
1959
1960 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1961 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1962 * to push all packets out.
1963 */
1964 bkring->rhead = bkring->rcur = kring->nr_hwtail;
1965
1966 netmap_vp_txsync(bkring, flags);
1967
1968 /* mark all buffers as released on this ring */
1969 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1970 /* another call to actually release the buffers */
1971 error = kring->nm_sync(kring, 0);
1972
1973 put_out:
1974 nm_kr_put(kring);
1975 return error;
1976 }
1977
1978
1979 /* nm_register callback for bwrap */
1980 static int
netmap_bwrap_register(struct netmap_adapter * na,int onoff)1981 netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1982 {
1983 struct netmap_bwrap_adapter *bna =
1984 (struct netmap_bwrap_adapter *)na;
1985 struct netmap_adapter *hwna = bna->hwna;
1986 struct netmap_vp_adapter *hostna = &bna->host;
1987 int error;
1988 enum txrx t;
1989
1990 ND("%s %s", na->name, onoff ? "on" : "off");
1991
1992 if (onoff) {
1993 int i;
1994
1995 /* netmap_do_regif has been called on the bwrap na.
1996 * We need to pass the information about the
1997 * memory allocator down to the hwna before
1998 * putting it in netmap mode
1999 */
2000 hwna->na_lut = na->na_lut;
2001
2002 if (hostna->na_bdg) {
2003 /* if the host rings have been attached to switch,
2004 * we need to copy the memory allocator information
2005 * in the hostna also
2006 */
2007 hostna->up.na_lut = na->na_lut;
2008 }
2009
2010 /* cross-link the netmap rings
2011 * The original number of rings comes from hwna,
2012 * rx rings on one side equals tx rings on the other.
2013 * We need to do this now, after the initialization
2014 * of the kring->ring pointers
2015 */
2016 for_rx_tx(t) {
2017 enum txrx r= nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2018 for (i = 0; i < nma_get_nrings(na, r) + 1; i++) {
2019 NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots;
2020 NMR(hwna, t)[i].ring = NMR(na, r)[i].ring;
2021 }
2022 }
2023 }
2024
2025 /* forward the request to the hwna */
2026 error = hwna->nm_register(hwna, onoff);
2027 if (error)
2028 return error;
2029
2030 /* impersonate a netmap_vp_adapter */
2031 netmap_vp_reg(na, onoff);
2032 if (hostna->na_bdg)
2033 netmap_vp_reg(&hostna->up, onoff);
2034
2035 if (onoff) {
2036 u_int i;
2037 /* intercept the hwna nm_nofify callback on the hw rings */
2038 for (i = 0; i < hwna->num_rx_rings; i++) {
2039 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2040 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2041 }
2042 i = hwna->num_rx_rings; /* for safety */
2043 /* save the host ring notify unconditionally */
2044 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2045 if (hostna->na_bdg) {
2046 /* also intercept the host ring notify */
2047 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2048 }
2049 } else {
2050 u_int i;
2051 /* reset all notify callbacks (including host ring) */
2052 for (i = 0; i <= hwna->num_rx_rings; i++) {
2053 hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2054 hwna->rx_rings[i].save_notify = NULL;
2055 }
2056 hwna->na_lut.lut = NULL;
2057 hwna->na_lut.objtotal = 0;
2058 hwna->na_lut.objsize = 0;
2059 }
2060
2061 return 0;
2062 }
2063
2064 /* nm_config callback for bwrap */
2065 static int
netmap_bwrap_config(struct netmap_adapter * na,u_int * txr,u_int * txd,u_int * rxr,u_int * rxd)2066 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2067 u_int *rxr, u_int *rxd)
2068 {
2069 struct netmap_bwrap_adapter *bna =
2070 (struct netmap_bwrap_adapter *)na;
2071 struct netmap_adapter *hwna = bna->hwna;
2072
2073 /* forward the request */
2074 netmap_update_config(hwna);
2075 /* swap the results */
2076 *txr = hwna->num_rx_rings;
2077 *txd = hwna->num_rx_desc;
2078 *rxr = hwna->num_tx_rings;
2079 *rxd = hwna->num_rx_desc;
2080
2081 return 0;
2082 }
2083
2084
2085 /* nm_krings_create callback for bwrap */
2086 static int
netmap_bwrap_krings_create(struct netmap_adapter * na)2087 netmap_bwrap_krings_create(struct netmap_adapter *na)
2088 {
2089 struct netmap_bwrap_adapter *bna =
2090 (struct netmap_bwrap_adapter *)na;
2091 struct netmap_adapter *hwna = bna->hwna;
2092 struct netmap_adapter *hostna = &bna->host.up;
2093 int error;
2094
2095 ND("%s", na->name);
2096
2097 /* impersonate a netmap_vp_adapter */
2098 error = netmap_vp_krings_create(na);
2099 if (error)
2100 return error;
2101
2102 /* also create the hwna krings */
2103 error = hwna->nm_krings_create(hwna);
2104 if (error) {
2105 netmap_vp_krings_delete(na);
2106 return error;
2107 }
2108 /* the connection between the bwrap krings and the hwna krings
2109 * will be perfomed later, in the nm_register callback, since
2110 * now the kring->ring pointers have not been initialized yet
2111 */
2112
2113 if (na->na_flags & NAF_HOST_RINGS) {
2114 /* the hostna rings are the host rings of the bwrap.
2115 * The corresponding krings must point back to the
2116 * hostna
2117 */
2118 hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
2119 hostna->tx_rings[0].na = hostna;
2120 hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
2121 hostna->rx_rings[0].na = hostna;
2122 }
2123
2124 return 0;
2125 }
2126
2127
2128 static void
netmap_bwrap_krings_delete(struct netmap_adapter * na)2129 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2130 {
2131 struct netmap_bwrap_adapter *bna =
2132 (struct netmap_bwrap_adapter *)na;
2133 struct netmap_adapter *hwna = bna->hwna;
2134
2135 ND("%s", na->name);
2136
2137 hwna->nm_krings_delete(hwna);
2138 netmap_vp_krings_delete(na);
2139 }
2140
2141
2142 /* notify method for the bridge-->hwna direction */
2143 static int
netmap_bwrap_notify(struct netmap_kring * kring,int flags)2144 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2145 {
2146 struct netmap_adapter *na = kring->na;
2147 struct netmap_bwrap_adapter *bna = na->na_private;
2148 struct netmap_adapter *hwna = bna->hwna;
2149 u_int ring_n = kring->ring_id;
2150 u_int lim = kring->nkr_num_slots - 1;
2151 struct netmap_kring *hw_kring;
2152 int error = 0;
2153
2154 ND("%s: na %s hwna %s",
2155 (kring ? kring->name : "NULL!"),
2156 (na ? na->name : "NULL!"),
2157 (hwna ? hwna->name : "NULL!"));
2158 hw_kring = &hwna->tx_rings[ring_n];
2159
2160 if (nm_kr_tryget(hw_kring))
2161 return 0;
2162
2163 if (!nm_netmap_on(hwna))
2164 return 0;
2165 /* first step: simulate a user wakeup on the rx ring */
2166 netmap_vp_rxsync(kring, flags);
2167 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2168 na->name, ring_n,
2169 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2170 ring->head, ring->cur, ring->tail,
2171 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail);
2172 /* second step: the new packets are sent on the tx ring
2173 * (which is actually the same ring)
2174 */
2175 hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2176 error = hw_kring->nm_sync(hw_kring, flags);
2177 if (error)
2178 goto out;
2179
2180 /* third step: now we are back the rx ring */
2181 /* claim ownership on all hw owned bufs */
2182 kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2183
2184 /* fourth step: the user goes to sleep again, causing another rxsync */
2185 netmap_vp_rxsync(kring, flags);
2186 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2187 na->name, ring_n,
2188 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2189 ring->head, ring->cur, ring->tail,
2190 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2191 out:
2192 nm_kr_put(hw_kring);
2193 return error;
2194 }
2195
2196
2197 /* nm_bdg_ctl callback for the bwrap.
2198 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2199 * On attach, it needs to provide a fake netmap_priv_d structure and
2200 * perform a netmap_do_regif() on the bwrap. This will put both the
2201 * bwrap and the hwna in netmap mode, with the netmap rings shared
2202 * and cross linked. Moroever, it will start intercepting interrupts
2203 * directed to hwna.
2204 */
2205 static int
netmap_bwrap_bdg_ctl(struct netmap_adapter * na,struct nmreq * nmr,int attach)2206 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2207 {
2208 struct netmap_priv_d *npriv;
2209 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2210 int error = 0;
2211
2212 if (attach) {
2213 if (NETMAP_OWNED_BY_ANY(na)) {
2214 return EBUSY;
2215 }
2216 if (bna->na_kpriv) {
2217 /* nothing to do */
2218 return 0;
2219 }
2220 npriv = malloc(sizeof(*npriv), M_NETMAP, M_NOWAIT|M_ZERO);
2221 if (npriv == NULL)
2222 return ENOMEM;
2223 error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
2224 if (error) {
2225 bzero(npriv, sizeof(*npriv));
2226 free(npriv, M_NETMAP);
2227 return error;
2228 }
2229 bna->na_kpriv = npriv;
2230 na->na_flags |= NAF_BUSY;
2231 } else {
2232 int last_instance;
2233
2234 if (na->active_fds == 0) /* not registered */
2235 return EINVAL;
2236 last_instance = netmap_dtor_locked(bna->na_kpriv);
2237 if (!last_instance) {
2238 D("--- error, trying to detach an entry with active mmaps");
2239 error = EINVAL;
2240 } else {
2241 struct nm_bridge *b = bna->up.na_bdg,
2242 *bh = bna->host.na_bdg;
2243 npriv = bna->na_kpriv;
2244 bna->na_kpriv = NULL;
2245 D("deleting priv");
2246
2247 bzero(npriv, sizeof(*npriv));
2248 free(npriv, M_NETMAP);
2249 if (b) {
2250 /* XXX the bwrap dtor should take care
2251 * of this (2014-06-16)
2252 */
2253 netmap_bdg_detach_common(b, bna->up.bdg_port,
2254 (bh ? bna->host.bdg_port : -1));
2255 }
2256 na->na_flags &= ~NAF_BUSY;
2257 }
2258 }
2259 return error;
2260
2261 }
2262
2263 /* attach a bridge wrapper to the 'real' device */
2264 int
netmap_bwrap_attach(const char * nr_name,struct netmap_adapter * hwna)2265 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2266 {
2267 struct netmap_bwrap_adapter *bna;
2268 struct netmap_adapter *na = NULL;
2269 struct netmap_adapter *hostna = NULL;
2270 int error = 0;
2271 enum txrx t;
2272
2273 /* make sure the NIC is not already in use */
2274 if (NETMAP_OWNED_BY_ANY(hwna)) {
2275 D("NIC %s busy, cannot attach to bridge", hwna->name);
2276 return EBUSY;
2277 }
2278
2279 bna = malloc(sizeof(*bna), M_NETMAP, M_NOWAIT | M_ZERO);
2280 if (bna == NULL) {
2281 return ENOMEM;
2282 }
2283
2284 na = &bna->up.up;
2285 na->na_private = bna;
2286 strncpy(na->name, nr_name, sizeof(na->name));
2287 /* fill the ring data for the bwrap adapter with rx/tx meanings
2288 * swapped. The real cross-linking will be done during register,
2289 * when all the krings will have been created.
2290 */
2291 for_rx_tx(t) {
2292 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2293 nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2294 nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2295 }
2296 na->nm_dtor = netmap_bwrap_dtor;
2297 na->nm_register = netmap_bwrap_register;
2298 // na->nm_txsync = netmap_bwrap_txsync;
2299 // na->nm_rxsync = netmap_bwrap_rxsync;
2300 na->nm_config = netmap_bwrap_config;
2301 na->nm_krings_create = netmap_bwrap_krings_create;
2302 na->nm_krings_delete = netmap_bwrap_krings_delete;
2303 na->nm_notify = netmap_bwrap_notify;
2304 na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2305 na->pdev = hwna->pdev;
2306 na->nm_mem = netmap_mem_private_new(na->name,
2307 na->num_tx_rings, na->num_tx_desc,
2308 na->num_rx_rings, na->num_rx_desc,
2309 0, 0, &error);
2310 na->na_flags |= NAF_MEM_OWNER;
2311 if (na->nm_mem == NULL)
2312 goto err_put;
2313 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2314
2315 bna->hwna = hwna;
2316 netmap_adapter_get(hwna);
2317 hwna->na_private = bna; /* weak reference */
2318 hwna->na_vp = &bna->up;
2319
2320 if (hwna->na_flags & NAF_HOST_RINGS) {
2321 if (hwna->na_flags & NAF_SW_ONLY)
2322 na->na_flags |= NAF_SW_ONLY;
2323 na->na_flags |= NAF_HOST_RINGS;
2324 hostna = &bna->host.up;
2325 snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2326 hostna->ifp = hwna->ifp;
2327 for_rx_tx(t) {
2328 enum txrx r = nm_txrx_swap(t);
2329 nma_set_nrings(hostna, t, 1);
2330 nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2331 }
2332 // hostna->nm_txsync = netmap_bwrap_host_txsync;
2333 // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2334 hostna->nm_notify = netmap_bwrap_notify;
2335 hostna->nm_mem = na->nm_mem;
2336 hostna->na_private = bna;
2337 hostna->na_vp = &bna->up;
2338 na->na_hostvp = hwna->na_hostvp =
2339 hostna->na_hostvp = &bna->host;
2340 hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
2341 }
2342
2343 ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2344 na->name, ifp->if_xname,
2345 na->num_tx_rings, na->num_tx_desc,
2346 na->num_rx_rings, na->num_rx_desc);
2347
2348 error = netmap_attach_common(na);
2349 if (error) {
2350 goto err_free;
2351 }
2352 /* make bwrap ifp point to the real ifp
2353 * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
2354 * as a request to make the ifp point to the na. Since we
2355 * do not want to change the na already pointed to by hwna->ifp,
2356 * the following assignment has to be delayed until now
2357 */
2358 na->ifp = hwna->ifp;
2359 hwna->na_flags |= NAF_BUSY;
2360 /* make hwna point to the allocator we are actually using,
2361 * so that monitors will be able to find it
2362 */
2363 bna->save_nmd = hwna->nm_mem;
2364 hwna->nm_mem = na->nm_mem;
2365 return 0;
2366
2367 err_free:
2368 netmap_mem_delete(na->nm_mem);
2369 err_put:
2370 hwna->na_vp = hwna->na_hostvp = NULL;
2371 netmap_adapter_put(hwna);
2372 free(bna, M_NETMAP);
2373 return error;
2374
2375 }
2376
2377 struct nm_bridge *
netmap_init_bridges2(u_int n)2378 netmap_init_bridges2(u_int n)
2379 {
2380 int i;
2381 struct nm_bridge *b;
2382
2383 b = malloc(sizeof(struct nm_bridge) * n, M_NETMAP,
2384 M_NOWAIT | M_ZERO);
2385 if (b == NULL)
2386 return NULL;
2387 for (i = 0; i < n; i++)
2388 BDG_RWINIT(&b[i]);
2389 return b;
2390 }
2391
2392 void
netmap_uninit_bridges2(struct nm_bridge * b,u_int n)2393 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2394 {
2395 int i;
2396
2397 if (b == NULL)
2398 return;
2399
2400 for (i = 0; i < n; i++)
2401 BDG_RWDESTROY(&b[i]);
2402 free(b, M_NETMAP);
2403 }
2404
2405 int
netmap_init_bridges(void)2406 netmap_init_bridges(void)
2407 {
2408 #ifdef CONFIG_NET_NS
2409 return netmap_bns_register();
2410 #else
2411 nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2412 if (nm_bridges == NULL)
2413 return ENOMEM;
2414 return 0;
2415 #endif
2416 }
2417
2418 void
netmap_uninit_bridges(void)2419 netmap_uninit_bridges(void)
2420 {
2421 #ifdef CONFIG_NET_NS
2422 netmap_bns_unregister();
2423 #else
2424 netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2425 #endif
2426 }
2427 #endif /* WITH_VALE */
2428