xref: /freebsd-13-stable/sys/dev/netmap/netmap_bdg.c (revision 3bc80996974a61a4223eae4c1ccd47b6ee32a48a)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
36 For each switch, an SX lock (RWlock on linux) protects
37 deletion of ports. When configuring or deleting a new port, the
38 lock is acquired in exclusive mode (after holding NMG_LOCK).
39 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
40 The lock is held throughout the entire forwarding cycle,
41 during which the thread may incur a page fault.
42 Hence it is important that sleepable shared locks are used.
43 
44 On the rx ring, the per-port lock is grabbed initially to reserve
45 a number of slots in the ring, then the lock is released,
46 packets are copied from source to destination, and then
47 the lock is acquired again and the receive ring is updated.
48 (A similar thing is done on the tx ring for NIC and host stack
49 ports attached to the switch)
50 
51  */
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 #include <sys/types.h>
62 #include <sys/errno.h>
63 #include <sys/param.h>	/* defines used in kernel.h */
64 #include <sys/kernel.h>	/* types used in module initialization */
65 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
66 #include <sys/sockio.h>
67 #include <sys/socketvar.h>	/* struct socket */
68 #include <sys/malloc.h>
69 #include <sys/poll.h>
70 #include <sys/rwlock.h>
71 #include <sys/socket.h> /* sockaddrs */
72 #include <sys/selinfo.h>
73 #include <sys/sysctl.h>
74 #include <net/if.h>
75 #include <net/if_var.h>
76 #include <net/bpf.h>		/* BIOCIMMEDIATE */
77 #include <machine/bus.h>	/* bus_dmamap_* */
78 #include <sys/endian.h>
79 #include <sys/refcount.h>
80 #include <sys/smp.h>
81 
82 
83 #elif defined(linux)
84 
85 #include "bsd_glue.h"
86 
87 #elif defined(__APPLE__)
88 
89 #warning OSX support is only partial
90 #include "osx_glue.h"
91 
92 #elif defined(_WIN32)
93 #include "win_glue.h"
94 
95 #else
96 
97 #error	Unsupported platform
98 
99 #endif /* unsupported */
100 
101 /*
102  * common headers
103  */
104 
105 #include <net/netmap.h>
106 #include <dev/netmap/netmap_kern.h>
107 #include <dev/netmap/netmap_mem2.h>
108 
109 #include <dev/netmap/netmap_bdg.h>
110 
111 const char*
netmap_bdg_name(struct netmap_vp_adapter * vp)112 netmap_bdg_name(struct netmap_vp_adapter *vp)
113 {
114 	struct nm_bridge *b = vp->na_bdg;
115 	if (b == NULL)
116 		return NULL;
117 	return b->bdg_basename;
118 }
119 
120 
121 #ifndef CONFIG_NET_NS
122 /*
123  * Bridge table, compiled in only when CONFIG_NET_NS is not defined.
124  * XXX in principle nm_bridges could be created dynamically; right now
125  * we have a static array and deletions are protected by an exclusive lock.
126  */
127 struct nm_bridge *nm_bridges;
128 #endif /* !CONFIG_NET_NS */
129 
130 
/*
 * Return 1 if 'c' is allowed in a bridge name (an ASCII letter, an ASCII
 * digit or '_'), 0 otherwise.
 */
static int
nm_is_id_char(const char c)
{
	if (c == '_')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	if (c >= 'a' && c <= 'z')
		return 1;
	return (c >= 'A' && c <= 'Z');
}
139 
140 /* Validate the name of a bdg port and return the
141  * position of the ":" character. */
142 static int
nm_bdg_name_validate(const char * name,size_t prefixlen)143 nm_bdg_name_validate(const char *name, size_t prefixlen)
144 {
145 	int colon_pos = -1;
146 	int i;
147 
148 	if (!name || strlen(name) < prefixlen) {
149 		return -1;
150 	}
151 
152 	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
153 		if (name[i] == ':') {
154 			colon_pos = i;
155 			break;
156 		} else if (!nm_is_id_char(name[i])) {
157 			return -1;
158 		}
159 	}
160 
161 	if (strlen(name) - colon_pos > IFNAMSIZ) {
162 		/* interface name too long */
163 		return -1;
164 	}
165 
166 	return colon_pos;
167 }
168 
169 /*
170  * locate a bridge among the existing ones.
171  * MUST BE CALLED WITH NMG_LOCK()
172  *
173  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
174  * We assume that this is called with a name of at least NM_NAME chars.
175  */
176 struct nm_bridge *
nm_find_bridge(const char * name,int create,struct netmap_bdg_ops * ops)177 nm_find_bridge(const char *name, int create, struct netmap_bdg_ops *ops)
178 {
179 	int i, namelen;
180 	struct nm_bridge *b = NULL, *bridges;
181 	u_int num_bridges;
182 
183 	NMG_LOCK_ASSERT();
184 
185 	netmap_bns_getbridges(&bridges, &num_bridges);
186 
187 	namelen = nm_bdg_name_validate(name,
188 			(ops != NULL ? strlen(ops->name) : 0));
189 	if (namelen < 0) {
190 		nm_prerr("invalid bridge name %s", name ? name : NULL);
191 		return NULL;
192 	}
193 
194 	/* lookup the name, remember empty slot if there is one */
195 	for (i = 0; i < num_bridges; i++) {
196 		struct nm_bridge *x = bridges + i;
197 
198 		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
199 			if (create && b == NULL)
200 				b = x;	/* record empty slot */
201 		} else if (x->bdg_namelen != namelen) {
202 			continue;
203 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
204 			nm_prdis("found '%.*s' at %d", namelen, name, i);
205 			b = x;
206 			break;
207 		}
208 	}
209 	if (i == num_bridges && b) { /* name not found, can create entry */
210 		/* initialize the bridge */
211 		nm_prdis("create new bridge %s with ports %d", b->bdg_basename,
212 			b->bdg_active_ports);
213 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
214 		if (b->ht == NULL) {
215 			nm_prerr("failed to allocate hash table");
216 			return NULL;
217 		}
218 		strncpy(b->bdg_basename, name, namelen);
219 		b->bdg_namelen = namelen;
220 		b->bdg_active_ports = 0;
221 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
222 			b->bdg_port_index[i] = i;
223 		/* set the default function */
224 		b->bdg_ops = b->bdg_saved_ops = *ops;
225 		b->private_data = b->ht;
226 		b->bdg_flags = 0;
227 		NM_BNS_GET(b);
228 	}
229 	return b;
230 }
231 
232 
233 int
netmap_bdg_free(struct nm_bridge * b)234 netmap_bdg_free(struct nm_bridge *b)
235 {
236 	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
237 		return EBUSY;
238 	}
239 
240 	nm_prdis("marking bridge %s as free", b->bdg_basename);
241 	nm_os_free(b->ht);
242 	memset(&b->bdg_ops, 0, sizeof(b->bdg_ops));
243 	memset(&b->bdg_saved_ops, 0, sizeof(b->bdg_saved_ops));
244 	b->bdg_flags = 0;
245 	NM_BNS_PUT(b);
246 	return 0;
247 }
248 
249 /* Called by external kernel modules (e.g., Openvswitch).
250  * to modify the private data previously given to regops().
251  * 'name' may be just bridge's name (including ':' if it
252  * is not just NM_BDG_NAME).
253  * Called without NMG_LOCK.
254  */
255 int
netmap_bdg_update_private_data(const char * name,bdg_update_private_data_fn_t callback,void * callback_data,void * auth_token)256 netmap_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
257 	void *callback_data, void *auth_token)
258 {
259 	void *private_data = NULL;
260 	struct nm_bridge *b;
261 	int error = 0;
262 
263 	NMG_LOCK();
264 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
265 	if (!b) {
266 		error = EINVAL;
267 		goto unlock_update_priv;
268 	}
269 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
270 		error = EACCES;
271 		goto unlock_update_priv;
272 	}
273 	BDG_WLOCK(b);
274 	private_data = callback(b->private_data, callback_data, &error);
275 	b->private_data = private_data;
276 	BDG_WUNLOCK(b);
277 
278 unlock_update_priv:
279 	NMG_UNLOCK();
280 	return error;
281 }
282 
283 
284 
285 /* remove from bridge b the ports in slots hw and sw
286  * (sw can be -1 if not needed)
287  */
288 void
netmap_bdg_detach_common(struct nm_bridge * b,int hw,int sw)289 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
290 {
291 	int s_hw = hw, s_sw = sw;
292 	int i, lim =b->bdg_active_ports;
293 	uint32_t *tmp = b->tmp_bdg_port_index;
294 
295 	/*
296 	New algorithm:
297 	make a copy of bdg_port_index;
298 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
299 	in the array of bdg_port_index, replacing them with
300 	entries from the bottom of the array;
301 	decrement bdg_active_ports;
302 	acquire BDG_WLOCK() and copy back the array.
303 	 */
304 
305 	if (netmap_debug & NM_DEBUG_BDG)
306 		nm_prinf("detach %d and %d (lim %d)", hw, sw, lim);
307 	/* make a copy of the list of active ports, update it,
308 	 * and then copy back within BDG_WLOCK().
309 	 */
310 	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
311 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
312 		if (hw >= 0 && tmp[i] == hw) {
313 			nm_prdis("detach hw %d at %d", hw, i);
314 			lim--; /* point to last active port */
315 			tmp[i] = tmp[lim]; /* swap with i */
316 			tmp[lim] = hw;	/* now this is inactive */
317 			hw = -1;
318 		} else if (sw >= 0 && tmp[i] == sw) {
319 			nm_prdis("detach sw %d at %d", sw, i);
320 			lim--;
321 			tmp[i] = tmp[lim];
322 			tmp[lim] = sw;
323 			sw = -1;
324 		} else {
325 			i++;
326 		}
327 	}
328 	if (hw >= 0 || sw >= 0) {
329 		nm_prerr("delete failed hw %d sw %d, should panic...", hw, sw);
330 	}
331 
332 	BDG_WLOCK(b);
333 	if (b->bdg_ops.dtor)
334 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
335 	b->bdg_ports[s_hw] = NULL;
336 	if (s_sw >= 0) {
337 		b->bdg_ports[s_sw] = NULL;
338 	}
339 	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
340 	b->bdg_active_ports = lim;
341 	BDG_WUNLOCK(b);
342 
343 	nm_prdis("now %d active ports", lim);
344 	netmap_bdg_free(b);
345 }
346 
347 
348 /* nm_bdg_ctl callback for VALE ports */
349 int
netmap_vp_bdg_ctl(struct nmreq_header * hdr,struct netmap_adapter * na)350 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
351 {
352 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
353 	struct nm_bridge *b = vpna->na_bdg;
354 
355 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
356 		return 0; /* nothing to do */
357 	}
358 	if (b) {
359 		netmap_set_all_rings(na, 0 /* disable */);
360 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
361 		vpna->na_bdg = NULL;
362 		netmap_set_all_rings(na, 1 /* enable */);
363 	}
364 	/* I have took reference just for attach */
365 	netmap_adapter_put(na);
366 	return 0;
367 }
368 
369 int
netmap_default_bdg_attach(const char * name,struct netmap_adapter * na,struct nm_bridge * b)370 netmap_default_bdg_attach(const char *name, struct netmap_adapter *na,
371 		struct nm_bridge *b)
372 {
373 	return NM_NEED_BWRAP;
374 }
375 
376 /* Try to get a reference to a netmap adapter attached to a VALE switch.
377  * If the adapter is found (or is created), this function returns 0, a
378  * non NULL pointer is returned into *na, and the caller holds a
379  * reference to the adapter.
380  * If an adapter is not found, then no reference is grabbed and the
381  * function returns an error code, or 0 if there is just a VALE prefix
382  * mismatch. Therefore the caller holds a reference when
383  * (*na != NULL && return == 0).
384  */
385 int
netmap_get_bdg_na(struct nmreq_header * hdr,struct netmap_adapter ** na,struct netmap_mem_d * nmd,int create,struct netmap_bdg_ops * ops)386 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
387 	struct netmap_mem_d *nmd, int create, struct netmap_bdg_ops *ops)
388 {
389 	char *nr_name = hdr->nr_name;
390 	const char *ifname;
391 	struct ifnet *ifp = NULL;
392 	int error = 0;
393 	struct netmap_vp_adapter *vpna, *hostna = NULL;
394 	struct nm_bridge *b;
395 	uint32_t i, j;
396 	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
397 	int needed;
398 
399 	*na = NULL;     /* default return value */
400 
401 	/* first try to see if this is a bridge port. */
402 	NMG_LOCK_ASSERT();
403 	if (strncmp(nr_name, ops->name, strlen(ops->name) - 1)) {
404 		return 0;  /* no error, but no VALE prefix */
405 	}
406 
407 	b = nm_find_bridge(nr_name, create, ops);
408 	if (b == NULL) {
409 		nm_prdis("no bridges available for '%s'", nr_name);
410 		return (create ? ENOMEM : ENXIO);
411 	}
412 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
413 		panic("x");
414 
415 	/* Now we are sure that name starts with the bridge's name,
416 	 * lookup the port in the bridge. We need to scan the entire
417 	 * list. It is not important to hold a WLOCK on the bridge
418 	 * during the search because NMG_LOCK already guarantees
419 	 * that there are no other possible writers.
420 	 */
421 
422 	/* lookup in the local list of ports */
423 	for (j = 0; j < b->bdg_active_ports; j++) {
424 		i = b->bdg_port_index[j];
425 		vpna = b->bdg_ports[i];
426 		nm_prdis("checking %s", vpna->up.name);
427 		if (!strcmp(vpna->up.name, nr_name)) {
428 			netmap_adapter_get(&vpna->up);
429 			nm_prdis("found existing if %s refs %d", nr_name)
430 			*na = &vpna->up;
431 			return 0;
432 		}
433 	}
434 	/* not found, should we create it? */
435 	if (!create)
436 		return ENXIO;
437 	/* yes we should, see if we have space to attach entries */
438 	needed = 2; /* in some cases we only need 1 */
439 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
440 		nm_prerr("bridge full %d, cannot create new port", b->bdg_active_ports);
441 		return ENOMEM;
442 	}
443 	/* record the next two ports available, but do not allocate yet */
444 	cand = b->bdg_port_index[b->bdg_active_ports];
445 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
446 	nm_prdis("+++ bridge %s port %s used %d avail %d %d",
447 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
448 
449 	/*
450 	 * try see if there is a matching NIC with this name
451 	 * (after the bridge's name)
452 	 */
453 	ifname = nr_name + b->bdg_namelen + 1;
454 	ifp = ifunit_ref(ifname);
455 	if (!ifp) {
456 		/* Create an ephemeral virtual port.
457 		 * This block contains all the ephemeral-specific logic.
458 		 */
459 
460 		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
461 			error = EINVAL;
462 			goto out;
463 		}
464 
465 		/* bdg_netmap_attach creates a struct netmap_adapter */
466 		error = b->bdg_ops.vp_create(hdr, NULL, nmd, &vpna);
467 		if (error) {
468 			if (netmap_debug & NM_DEBUG_BDG)
469 				nm_prerr("error %d", error);
470 			goto out;
471 		}
472 		/* shortcut - we can skip get_hw_na(),
473 		 * ownership check and nm_bdg_attach()
474 		 */
475 
476 	} else {
477 		struct netmap_adapter *hw;
478 
479 		/* the vale:nic syntax is only valid for some commands */
480 		switch (hdr->nr_reqtype) {
481 		case NETMAP_REQ_VALE_ATTACH:
482 		case NETMAP_REQ_VALE_DETACH:
483 		case NETMAP_REQ_VALE_POLLING_ENABLE:
484 		case NETMAP_REQ_VALE_POLLING_DISABLE:
485 			break; /* ok */
486 		default:
487 			error = EINVAL;
488 			goto out;
489 		}
490 
491 		error = netmap_get_hw_na(ifp, nmd, &hw);
492 		if (error || hw == NULL)
493 			goto out;
494 
495 		/* host adapter might not be created */
496 		error = hw->nm_bdg_attach(nr_name, hw, b);
497 		if (error == NM_NEED_BWRAP) {
498 			error = b->bdg_ops.bwrap_attach(nr_name, hw);
499 		}
500 		if (error)
501 			goto out;
502 		vpna = hw->na_vp;
503 		hostna = hw->na_hostvp;
504 		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
505 			/* Check if we need to skip the host rings. */
506 			struct nmreq_vale_attach *areq =
507 				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
508 			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
509 				hostna = NULL;
510 			}
511 		}
512 	}
513 
514 	BDG_WLOCK(b);
515 	vpna->bdg_port = cand;
516 	nm_prdis("NIC  %p to bridge port %d", vpna, cand);
517 	/* bind the port to the bridge (virtual ports are not active) */
518 	b->bdg_ports[cand] = vpna;
519 	vpna->na_bdg = b;
520 	b->bdg_active_ports++;
521 	if (hostna != NULL) {
522 		/* also bind the host stack to the bridge */
523 		b->bdg_ports[cand2] = hostna;
524 		hostna->bdg_port = cand2;
525 		hostna->na_bdg = b;
526 		b->bdg_active_ports++;
527 		nm_prdis("host %p to bridge port %d", hostna, cand2);
528 	}
529 	nm_prdis("if %s refs %d", ifname, vpna->up.na_refcount);
530 	BDG_WUNLOCK(b);
531 	*na = &vpna->up;
532 	netmap_adapter_get(*na);
533 
534 out:
535 	if (ifp)
536 		if_rele(ifp);
537 
538 	return error;
539 }
540 
541 
542 int
nm_is_bwrap(struct netmap_adapter * na)543 nm_is_bwrap(struct netmap_adapter *na)
544 {
545 	return na->nm_register == netmap_bwrap_reg;
546 }
547 
548 
549 struct nm_bdg_polling_state;
550 struct
551 nm_bdg_kthread {
552 	struct nm_kctx *nmk;
553 	u_int qfirst;
554 	u_int qlast;
555 	struct nm_bdg_polling_state *bps;
556 };
557 
558 struct nm_bdg_polling_state {
559 	bool configured;
560 	bool stopped;
561 	struct netmap_bwrap_adapter *bna;
562 	uint32_t mode;
563 	u_int qfirst;
564 	u_int qlast;
565 	u_int cpu_from;
566 	u_int ncpus;
567 	struct nm_bdg_kthread *kthreads;
568 };
569 
570 static void
netmap_bwrap_polling(void * data)571 netmap_bwrap_polling(void *data)
572 {
573 	struct nm_bdg_kthread *nbk = data;
574 	struct netmap_bwrap_adapter *bna;
575 	u_int qfirst, qlast, i;
576 	struct netmap_kring **kring0, *kring;
577 
578 	if (!nbk)
579 		return;
580 	qfirst = nbk->qfirst;
581 	qlast = nbk->qlast;
582 	bna = nbk->bps->bna;
583 	kring0 = NMR(bna->hwna, NR_RX);
584 
585 	for (i = qfirst; i < qlast; i++) {
586 		kring = kring0[i];
587 		kring->nm_notify(kring, 0);
588 	}
589 }
590 
591 static int
nm_bdg_create_kthreads(struct nm_bdg_polling_state * bps)592 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
593 {
594 	struct nm_kctx_cfg kcfg;
595 	int i, j;
596 
597 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
598 	if (bps->kthreads == NULL)
599 		return ENOMEM;
600 
601 	bzero(&kcfg, sizeof(kcfg));
602 	kcfg.worker_fn = netmap_bwrap_polling;
603 	for (i = 0; i < bps->ncpus; i++) {
604 		struct nm_bdg_kthread *t = bps->kthreads + i;
605 		int all = (bps->ncpus == 1 &&
606 			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
607 		int affinity = bps->cpu_from + i;
608 
609 		t->bps = bps;
610 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
611 		t->qlast = all ? bps->qlast : t->qfirst + 1;
612 		if (netmap_verbose)
613 			nm_prinf("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
614 				t->qlast);
615 
616 		kcfg.type = i;
617 		kcfg.worker_private = t;
618 		t->nmk = nm_os_kctx_create(&kcfg, NULL);
619 		if (t->nmk == NULL) {
620 			goto cleanup;
621 		}
622 		nm_os_kctx_worker_setaff(t->nmk, affinity);
623 	}
624 	return 0;
625 
626 cleanup:
627 	for (j = 0; j < i; j++) {
628 		struct nm_bdg_kthread *t = bps->kthreads + i;
629 		nm_os_kctx_destroy(t->nmk);
630 	}
631 	nm_os_free(bps->kthreads);
632 	return EFAULT;
633 }
634 
635 /* A variant of ptnetmap_start_kthreads() */
636 static int
nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state * bps)637 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
638 {
639 	int error, i, j;
640 
641 	if (!bps) {
642 		nm_prerr("polling is not configured");
643 		return EFAULT;
644 	}
645 	bps->stopped = false;
646 
647 	for (i = 0; i < bps->ncpus; i++) {
648 		struct nm_bdg_kthread *t = bps->kthreads + i;
649 		error = nm_os_kctx_worker_start(t->nmk);
650 		if (error) {
651 			nm_prerr("error in nm_kthread_start(): %d", error);
652 			goto cleanup;
653 		}
654 	}
655 	return 0;
656 
657 cleanup:
658 	for (j = 0; j < i; j++) {
659 		struct nm_bdg_kthread *t = bps->kthreads + i;
660 		nm_os_kctx_worker_stop(t->nmk);
661 	}
662 	bps->stopped = true;
663 	return error;
664 }
665 
666 static void
nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state * bps)667 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
668 {
669 	int i;
670 
671 	if (!bps)
672 		return;
673 
674 	for (i = 0; i < bps->ncpus; i++) {
675 		struct nm_bdg_kthread *t = bps->kthreads + i;
676 		nm_os_kctx_worker_stop(t->nmk);
677 		nm_os_kctx_destroy(t->nmk);
678 	}
679 	bps->stopped = true;
680 }
681 
682 static int
get_polling_cfg(struct nmreq_vale_polling * req,struct netmap_adapter * na,struct nm_bdg_polling_state * bps)683 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
684 		struct nm_bdg_polling_state *bps)
685 {
686 	unsigned int avail_cpus, core_from;
687 	unsigned int qfirst, qlast;
688 	uint32_t i = req->nr_first_cpu_id;
689 	uint32_t req_cpus = req->nr_num_polling_cpus;
690 
691 	avail_cpus = nm_os_ncpus();
692 
693 	if (req_cpus == 0) {
694 		nm_prerr("req_cpus must be > 0");
695 		return EINVAL;
696 	} else if (req_cpus >= avail_cpus) {
697 		nm_prerr("Cannot use all the CPUs in the system");
698 		return EINVAL;
699 	}
700 
701 	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
702 		/* Use a separate core for each ring. If nr_num_polling_cpus>1
703 		 * more consecutive rings are polled.
704 		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
705 		 * ring 2 and 3 are polled by core 2 and 3, respectively. */
706 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
707 			nm_prerr("Rings %u-%u not in range (have %d rings)",
708 				i, i + req_cpus, nma_get_nrings(na, NR_RX));
709 			return EINVAL;
710 		}
711 		qfirst = i;
712 		qlast = qfirst + req_cpus;
713 		core_from = qfirst;
714 
715 	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
716 		/* Poll all the rings using a core specified by nr_first_cpu_id.
717 		 * the number of cores must be 1. */
718 		if (req_cpus != 1) {
719 			nm_prerr("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
720 				"(was %d)", req_cpus);
721 			return EINVAL;
722 		}
723 		qfirst = 0;
724 		qlast = nma_get_nrings(na, NR_RX);
725 		core_from = i;
726 	} else {
727 		nm_prerr("Invalid polling mode");
728 		return EINVAL;
729 	}
730 
731 	bps->mode = req->nr_mode;
732 	bps->qfirst = qfirst;
733 	bps->qlast = qlast;
734 	bps->cpu_from = core_from;
735 	bps->ncpus = req_cpus;
736 	nm_prinf("%s qfirst %u qlast %u cpu_from %u ncpus %u",
737 		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
738 		"MULTI" : "SINGLE",
739 		qfirst, qlast, core_from, req_cpus);
740 	return 0;
741 }
742 
743 static int
nm_bdg_ctl_polling_start(struct nmreq_vale_polling * req,struct netmap_adapter * na)744 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
745 {
746 	struct nm_bdg_polling_state *bps;
747 	struct netmap_bwrap_adapter *bna;
748 	int error;
749 
750 	bna = (struct netmap_bwrap_adapter *)na;
751 	if (bna->na_polling_state) {
752 		nm_prerr("ERROR adapter already in polling mode");
753 		return EFAULT;
754 	}
755 
756 	bps = nm_os_malloc(sizeof(*bps));
757 	if (!bps)
758 		return ENOMEM;
759 	bps->configured = false;
760 	bps->stopped = true;
761 
762 	if (get_polling_cfg(req, na, bps)) {
763 		nm_os_free(bps);
764 		return EINVAL;
765 	}
766 
767 	if (nm_bdg_create_kthreads(bps)) {
768 		nm_os_free(bps);
769 		return EFAULT;
770 	}
771 
772 	bps->configured = true;
773 	bna->na_polling_state = bps;
774 	bps->bna = bna;
775 
776 	/* disable interrupts if possible */
777 	nma_intr_enable(bna->hwna, 0);
778 	/* start kthread now */
779 	error = nm_bdg_polling_start_kthreads(bps);
780 	if (error) {
781 		nm_prerr("ERROR nm_bdg_polling_start_kthread()");
782 		nm_os_free(bps->kthreads);
783 		nm_os_free(bps);
784 		bna->na_polling_state = NULL;
785 		nma_intr_enable(bna->hwna, 1);
786 	}
787 	return error;
788 }
789 
790 static int
nm_bdg_ctl_polling_stop(struct netmap_adapter * na)791 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
792 {
793 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
794 	struct nm_bdg_polling_state *bps;
795 
796 	if (!bna->na_polling_state) {
797 		nm_prerr("ERROR adapter is not in polling mode");
798 		return EFAULT;
799 	}
800 	bps = bna->na_polling_state;
801 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
802 	bps->configured = false;
803 	nm_os_free(bps);
804 	bna->na_polling_state = NULL;
805 	/* re-enable interrupts */
806 	nma_intr_enable(bna->hwna, 1);
807 	return 0;
808 }
809 
810 int
nm_bdg_polling(struct nmreq_header * hdr)811 nm_bdg_polling(struct nmreq_header *hdr)
812 {
813 	struct nmreq_vale_polling *req =
814 		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
815 	struct netmap_adapter *na = NULL;
816 	int error = 0;
817 
818 	NMG_LOCK();
819 	error = netmap_get_vale_na(hdr, &na, NULL, /*create=*/0);
820 	if (na && !error) {
821 		if (!nm_is_bwrap(na)) {
822 			error = EOPNOTSUPP;
823 		} else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
824 			error = nm_bdg_ctl_polling_start(req, na);
825 			if (!error)
826 				netmap_adapter_get(na);
827 		} else {
828 			error = nm_bdg_ctl_polling_stop(na);
829 			if (!error)
830 				netmap_adapter_put(na);
831 		}
832 		netmap_adapter_put(na);
833 	} else if (!na && !error) {
834 		/* Not VALE port. */
835 		error = EINVAL;
836 	}
837 	NMG_UNLOCK();
838 
839 	return error;
840 }
841 
842 /* Called by external kernel modules (e.g., Openvswitch).
843  * to set configure/lookup/dtor functions of a VALE instance.
844  * Register callbacks to the given bridge. 'name' may be just
845  * bridge's name (including ':' if it is not just NM_BDG_NAME).
846  *
847  * Called without NMG_LOCK.
848  */
849 
850 int
netmap_bdg_regops(const char * name,struct netmap_bdg_ops * bdg_ops,void * private_data,void * auth_token)851 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
852 {
853 	struct nm_bridge *b;
854 	int error = 0;
855 
856 	NMG_LOCK();
857 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
858 	if (!b) {
859 		error = ENXIO;
860 		goto unlock_regops;
861 	}
862 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
863 		error = EACCES;
864 		goto unlock_regops;
865 	}
866 
867 	BDG_WLOCK(b);
868 	if (!bdg_ops) {
869 		/* resetting the bridge */
870 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
871 		b->bdg_ops = b->bdg_saved_ops;
872 		b->private_data = b->ht;
873 	} else {
874 		/* modifying the bridge */
875 		b->private_data = private_data;
876 #define nm_bdg_override(m) if (bdg_ops->m) b->bdg_ops.m = bdg_ops->m
877 		nm_bdg_override(lookup);
878 		nm_bdg_override(config);
879 		nm_bdg_override(dtor);
880 		nm_bdg_override(vp_create);
881 		nm_bdg_override(bwrap_attach);
882 #undef nm_bdg_override
883 
884 	}
885 	BDG_WUNLOCK(b);
886 
887 unlock_regops:
888 	NMG_UNLOCK();
889 	return error;
890 }
891 
892 
893 int
netmap_bdg_config(struct nm_ifreq * nr)894 netmap_bdg_config(struct nm_ifreq *nr)
895 {
896 	struct nm_bridge *b;
897 	int error = EINVAL;
898 
899 	NMG_LOCK();
900 	b = nm_find_bridge(nr->nifr_name, 0, NULL);
901 	if (!b) {
902 		NMG_UNLOCK();
903 		return error;
904 	}
905 	NMG_UNLOCK();
906 	/* Don't call config() with NMG_LOCK() held */
907 	BDG_RLOCK(b);
908 	if (b->bdg_ops.config != NULL)
909 		error = b->bdg_ops.config(nr);
910 	BDG_RUNLOCK(b);
911 	return error;
912 }
913 
914 
915 /* nm_register callback for VALE ports */
916 int
netmap_vp_reg(struct netmap_adapter * na,int onoff)917 netmap_vp_reg(struct netmap_adapter *na, int onoff)
918 {
919 	struct netmap_vp_adapter *vpna =
920 		(struct netmap_vp_adapter*)na;
921 
922 	/* persistent ports may be put in netmap mode
923 	 * before being attached to a bridge
924 	 */
925 	if (vpna->na_bdg)
926 		BDG_WLOCK(vpna->na_bdg);
927 	if (onoff) {
928 		netmap_krings_mode_commit(na, onoff);
929 		if (na->active_fds == 0)
930 			na->na_flags |= NAF_NETMAP_ON;
931 		 /* XXX on FreeBSD, persistent VALE ports should also
932 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
933 		 */
934 	} else {
935 		if (na->active_fds == 0)
936 			na->na_flags &= ~NAF_NETMAP_ON;
937 		netmap_krings_mode_commit(na, onoff);
938 	}
939 	if (vpna->na_bdg)
940 		BDG_WUNLOCK(vpna->na_bdg);
941 	return 0;
942 }
943 
944 
945 /* rxsync code used by VALE ports nm_rxsync callback and also
946  * internally by the brwap
947  */
948 static int
netmap_vp_rxsync_locked(struct netmap_kring * kring,int flags)949 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
950 {
951 	struct netmap_adapter *na = kring->na;
952 	struct netmap_ring *ring = kring->ring;
953 	u_int nm_i, lim = kring->nkr_num_slots - 1;
954 	u_int head = kring->rhead;
955 	int n;
956 
957 	if (head > lim) {
958 		nm_prerr("ouch dangerous reset!!!");
959 		n = netmap_ring_reinit(kring);
960 		goto done;
961 	}
962 
963 	/* First part, import newly received packets. */
964 	/* actually nothing to do here, they are already in the kring */
965 
966 	/* Second part, skip past packets that userspace has released. */
967 	nm_i = kring->nr_hwcur;
968 	if (nm_i != head) {
969 		/* consistency check, but nothing really important here */
970 		for (n = 0; likely(nm_i != head); n++) {
971 			struct netmap_slot *slot = &ring->slot[nm_i];
972 			void *addr = NMB(na, slot);
973 
974 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
975 				nm_prerr("bad buffer index %d, ignore ?",
976 					slot->buf_idx);
977 			}
978 			slot->flags &= ~NS_BUF_CHANGED;
979 			nm_i = nm_next(nm_i, lim);
980 		}
981 		kring->nr_hwcur = head;
982 	}
983 
984 	n = 0;
985 done:
986 	return n;
987 }
988 
989 /*
990  * nm_rxsync callback for VALE ports
991  * user process reading from a VALE switch.
992  * Already protected against concurrent calls from userspace,
993  * but we must acquire the queue's lock to protect against
994  * writers on the same queue.
995  */
996 int
netmap_vp_rxsync(struct netmap_kring * kring,int flags)997 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
998 {
999 	int n;
1000 
1001 	mtx_lock(&kring->q_lock);
1002 	n = netmap_vp_rxsync_locked(kring, flags);
1003 	mtx_unlock(&kring->q_lock);
1004 	return n;
1005 }
1006 
1007 int
netmap_bwrap_attach(const char * nr_name,struct netmap_adapter * hwna,struct netmap_bdg_ops * ops)1008 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna,
1009 		struct netmap_bdg_ops *ops)
1010 {
1011 	return ops->bwrap_attach(nr_name, hwna);
1012 }
1013 
1014 
1015 /* Bridge wrapper code (bwrap).
1016  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1017  * VALE switch.
1018  * The main task is to swap the meaning of tx and rx rings to match the
1019  * expectations of the VALE switch code (see nm_bdg_flush).
1020  *
1021  * The bwrap works by interposing a netmap_bwrap_adapter between the
1022  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1023  * a netmap_vp_adapter to the rest of the system, but, internally, it
1024  * translates all callbacks to what the hwna expects.
1025  *
1026  * Note that we have to intercept callbacks coming from two sides:
1027  *
1028  *  - callbacks coming from the netmap module are intercepted by
1029  *    passing around the netmap_bwrap_adapter instead of the hwna
1030  *
1031  *  - callbacks coming from outside of the netmap module only know
1032  *    about the hwna. This, however, only happens in interrupt
1033  *    handlers, where only the hwna->nm_notify callback is called.
1034  *    What the bwrap does is to overwrite the hwna->nm_notify callback
1035  *    with its own netmap_bwrap_intr_notify.
1036  *    XXX This assumes that the hwna->nm_notify callback was the
1037  *    standard netmap_notify(), as it is the case for nic adapters.
1038  *    Any additional action performed by hwna->nm_notify will not be
1039  *    performed by netmap_bwrap_intr_notify.
1040  *
1041  * Additionally, the bwrap can optionally attach the host rings pair
1042  * of the wrapped adapter to a different port of the switch.
1043  */
1044 
1045 
1046 static void
netmap_bwrap_dtor(struct netmap_adapter * na)1047 netmap_bwrap_dtor(struct netmap_adapter *na)
1048 {
1049 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1050 	struct netmap_adapter *hwna = bna->hwna;
1051 	struct nm_bridge *b = bna->up.na_bdg,
1052 		*bh = bna->host.na_bdg;
1053 
1054 	if (bna->host.up.nm_mem)
1055 		netmap_mem_put(bna->host.up.nm_mem);
1056 
1057 	if (b) {
1058 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1059 			    (bh ? bna->host.bdg_port : -1));
1060 	}
1061 
1062 	nm_prdis("na %p", na);
1063 	na->ifp = NULL;
1064 	bna->host.up.ifp = NULL;
1065 	hwna->na_vp = bna->saved_na_vp;
1066 	hwna->na_hostvp = NULL;
1067 	hwna->na_private = NULL;
1068 	hwna->na_flags &= ~NAF_BUSY;
1069 	netmap_adapter_put(hwna);
1070 
1071 }
1072 
1073 
1074 /*
1075  * Intr callback for NICs connected to a bridge.
1076  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1077  * and pass received packets from nic to the bridge.
1078  *
1079  * XXX TODO check locking: this is called from the interrupt
1080  * handler so we should make sure that the interface is not
1081  * disconnected while passing down an interrupt.
1082  *
1083  * Note, no user process can access this NIC or the host stack.
1084  * The only part of the ring that is significant are the slots,
1085  * and head/cur/tail are set from the kring as needed
1086  * (part as a receive ring, part as a transmit ring).
1087  *
1088  * callback that overwrites the hwna notify callback.
1089  * Packets come from the outside or from the host stack and are put on an
1090  * hwna rx ring.
1091  * The bridge wrapper then sends the packets through the bridge.
1092  */
1093 static int
netmap_bwrap_intr_notify(struct netmap_kring * kring,int flags)1094 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1095 {
1096 	struct netmap_adapter *na = kring->na;
1097 	struct netmap_bwrap_adapter *bna = na->na_private;
1098 	struct netmap_kring *bkring;
1099 	struct netmap_vp_adapter *vpna = &bna->up;
1100 	u_int ring_nr = kring->ring_id;
1101 	int ret = NM_IRQ_COMPLETED;
1102 	int error;
1103 
1104 	if (netmap_debug & NM_DEBUG_RXINTR)
1105 	    nm_prinf("%s %s 0x%x", na->name, kring->name, flags);
1106 
1107 	bkring = vpna->up.tx_rings[ring_nr];
1108 
1109 	/* make sure the ring is not disabled */
1110 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
1111 		return EIO;
1112 	}
1113 
1114 	if (netmap_debug & NM_DEBUG_RXINTR)
1115 	    nm_prinf("%s head %d cur %d tail %d",  na->name,
1116 		kring->rhead, kring->rcur, kring->rtail);
1117 
1118 	/* simulate a user wakeup on the rx ring
1119 	 * fetch packets that have arrived.
1120 	 */
1121 	error = kring->nm_sync(kring, 0);
1122 	if (error)
1123 		goto put_out;
1124 	if (kring->nr_hwcur == kring->nr_hwtail) {
1125 		if (netmap_verbose)
1126 			nm_prlim(1, "interrupt with no packets on %s",
1127 				kring->name);
1128 		goto put_out;
1129 	}
1130 
1131 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1132 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1133 	 * to push all packets out.
1134 	 */
1135 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
1136 
1137 	bkring->nm_sync(bkring, flags);
1138 
1139 	/* mark all buffers as released on this ring */
1140 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1141 	/* another call to actually release the buffers */
1142 	error = kring->nm_sync(kring, 0);
1143 
1144 	/* The second rxsync may have further advanced hwtail. If this happens,
1145 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
1146 	if (kring->rcur != kring->nr_hwtail) {
1147 		ret = NM_IRQ_RESCHED;
1148 	}
1149 put_out:
1150 	nm_kr_put(kring);
1151 
1152 	return error ? error : ret;
1153 }
1154 
1155 
1156 /* nm_register callback for bwrap */
1157 int
netmap_bwrap_reg(struct netmap_adapter * na,int onoff)1158 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
1159 {
1160 	struct netmap_bwrap_adapter *bna =
1161 		(struct netmap_bwrap_adapter *)na;
1162 	struct netmap_adapter *hwna = bna->hwna;
1163 	struct netmap_vp_adapter *hostna = &bna->host;
1164 	int error, i;
1165 	enum txrx t;
1166 
1167 	nm_prdis("%s %s", na->name, onoff ? "on" : "off");
1168 
1169 	if (onoff) {
1170 		/* netmap_do_regif has been called on the bwrap na.
1171 		 * We need to pass the information about the
1172 		 * memory allocator down to the hwna before
1173 		 * putting it in netmap mode
1174 		 */
1175 		hwna->na_lut = na->na_lut;
1176 
1177 		if (hostna->na_bdg) {
1178 			/* if the host rings have been attached to switch,
1179 			 * we need to copy the memory allocator information
1180 			 * in the hostna also
1181 			 */
1182 			hostna->up.na_lut = na->na_lut;
1183 		}
1184 
1185 	}
1186 
1187 	/* pass down the pending ring state information */
1188 	for_rx_tx(t) {
1189 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1190 			NMR(hwna, nm_txrx_swap(t))[i]->nr_pending_mode =
1191 				NMR(na, t)[i]->nr_pending_mode;
1192 		}
1193 	}
1194 
1195 	/* forward the request to the hwna */
1196 	error = hwna->nm_register(hwna, onoff);
1197 	if (error)
1198 		return error;
1199 
1200 	/* copy up the current ring state information */
1201 	for_rx_tx(t) {
1202 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1203 			struct netmap_kring *kring = NMR(hwna, nm_txrx_swap(t))[i];
1204 			NMR(na, t)[i]->nr_mode = kring->nr_mode;
1205 		}
1206 	}
1207 
1208 	/* impersonate a netmap_vp_adapter */
1209 	netmap_vp_reg(na, onoff);
1210 	if (hostna->na_bdg)
1211 		netmap_vp_reg(&hostna->up, onoff);
1212 
1213 	if (onoff) {
1214 		u_int i;
1215 		/* intercept the hwna nm_nofify callback on the hw rings */
1216 		for (i = 0; i < hwna->num_rx_rings; i++) {
1217 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
1218 			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
1219 		}
1220 		i = hwna->num_rx_rings; /* for safety */
1221 		/* save the host ring notify unconditionally */
1222 		for (; i < netmap_real_rings(hwna, NR_RX); i++) {
1223 			hwna->rx_rings[i]->save_notify =
1224 				hwna->rx_rings[i]->nm_notify;
1225 			if (hostna->na_bdg) {
1226 				/* also intercept the host ring notify */
1227 				hwna->rx_rings[i]->nm_notify =
1228 					netmap_bwrap_intr_notify;
1229 				na->tx_rings[i]->nm_sync = na->nm_txsync;
1230 			}
1231 		}
1232 		if (na->active_fds == 0)
1233 			na->na_flags |= NAF_NETMAP_ON;
1234 	} else {
1235 		u_int i;
1236 
1237 		if (na->active_fds == 0)
1238 			na->na_flags &= ~NAF_NETMAP_ON;
1239 
1240 		/* reset all notify callbacks (including host ring) */
1241 		for (i = 0; i < netmap_all_rings(hwna, NR_RX); i++) {
1242 			hwna->rx_rings[i]->nm_notify =
1243 				hwna->rx_rings[i]->save_notify;
1244 			hwna->rx_rings[i]->save_notify = NULL;
1245 		}
1246 		hwna->na_lut.lut = NULL;
1247 		hwna->na_lut.plut = NULL;
1248 		hwna->na_lut.objtotal = 0;
1249 		hwna->na_lut.objsize = 0;
1250 
1251 		/* pass ownership of the netmap rings to the hwna */
1252 		for_rx_tx(t) {
1253 			for (i = 0; i < netmap_all_rings(na, t); i++) {
1254 				NMR(na, t)[i]->ring = NULL;
1255 			}
1256 		}
1257 		/* reset the number of host rings to default */
1258 		for_rx_tx(t) {
1259 			nma_set_host_nrings(hwna, t, 1);
1260 		}
1261 
1262 	}
1263 
1264 	return 0;
1265 }
1266 
1267 /* nm_config callback for bwrap */
1268 static int
netmap_bwrap_config(struct netmap_adapter * na,struct nm_config_info * info)1269 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
1270 {
1271 	struct netmap_bwrap_adapter *bna =
1272 		(struct netmap_bwrap_adapter *)na;
1273 	struct netmap_adapter *hwna = bna->hwna;
1274 	int error;
1275 
1276 	/* Forward the request to the hwna. It may happen that nobody
1277 	 * registered hwna yet, so netmap_mem_get_lut() may have not
1278 	 * been called yet. */
1279 	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
1280 	if (error)
1281 		return error;
1282 	netmap_update_config(hwna);
1283 	/* swap the results and propagate */
1284 	info->num_tx_rings = hwna->num_rx_rings;
1285 	info->num_tx_descs = hwna->num_rx_desc;
1286 	info->num_rx_rings = hwna->num_tx_rings;
1287 	info->num_rx_descs = hwna->num_tx_desc;
1288 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
1289 
1290 	return 0;
1291 }
1292 
1293 
1294 /* nm_krings_create callback for bwrap */
1295 int
netmap_bwrap_krings_create_common(struct netmap_adapter * na)1296 netmap_bwrap_krings_create_common(struct netmap_adapter *na)
1297 {
1298 	struct netmap_bwrap_adapter *bna =
1299 		(struct netmap_bwrap_adapter *)na;
1300 	struct netmap_adapter *hwna = bna->hwna;
1301 	struct netmap_adapter *hostna = &bna->host.up;
1302 	int i, error = 0;
1303 	enum txrx t;
1304 
1305 	/* also create the hwna krings */
1306 	error = hwna->nm_krings_create(hwna);
1307 	if (error) {
1308 		return error;
1309 	}
1310 
1311 	/* increment the usage counter for all the hwna krings */
1312 	for_rx_tx(t) {
1313 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1314 			NMR(hwna, t)[i]->users++;
1315 		}
1316 	}
1317 
1318 	/* now create the actual rings */
1319 	error = netmap_mem_rings_create(hwna);
1320 	if (error) {
1321 		goto err_dec_users;
1322 	}
1323 
1324 	/* cross-link the netmap rings
1325 	 * The original number of rings comes from hwna,
1326 	 * rx rings on one side equals tx rings on the other.
1327 	 */
1328 	for_rx_tx(t) {
1329 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1330 		for (i = 0; i < netmap_all_rings(hwna, r); i++) {
1331 			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
1332 			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
1333 		}
1334 	}
1335 
1336 	if (na->na_flags & NAF_HOST_RINGS) {
1337 		/* the hostna rings are the host rings of the bwrap.
1338 		 * The corresponding krings must point back to the
1339 		 * hostna
1340 		 */
1341 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
1342 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
1343 		for_rx_tx(t) {
1344 			for (i = 0; i < nma_get_nrings(hostna, t); i++) {
1345 				NMR(hostna, t)[i]->na = hostna;
1346 			}
1347 		}
1348 	}
1349 
1350 	return 0;
1351 
1352 err_dec_users:
1353 	for_rx_tx(t) {
1354 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1355 			NMR(hwna, t)[i]->users--;
1356 		}
1357 	}
1358 	hwna->nm_krings_delete(hwna);
1359 	return error;
1360 }
1361 
1362 
1363 void
netmap_bwrap_krings_delete_common(struct netmap_adapter * na)1364 netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
1365 {
1366 	struct netmap_bwrap_adapter *bna =
1367 		(struct netmap_bwrap_adapter *)na;
1368 	struct netmap_adapter *hwna = bna->hwna;
1369 	enum txrx t;
1370 	int i;
1371 
1372 	nm_prdis("%s", na->name);
1373 
1374 	/* decrement the usage counter for all the hwna krings */
1375 	for_rx_tx(t) {
1376 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1377 			NMR(hwna, t)[i]->users--;
1378 		}
1379 	}
1380 
1381 	/* delete any netmap rings that are no longer needed */
1382 	netmap_mem_rings_delete(hwna);
1383 	hwna->nm_krings_delete(hwna);
1384 }
1385 
1386 
1387 /* notify method for the bridge-->hwna direction */
1388 int
netmap_bwrap_notify(struct netmap_kring * kring,int flags)1389 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
1390 {
1391 	struct netmap_adapter *na = kring->na;
1392 	struct netmap_bwrap_adapter *bna = na->na_private;
1393 	struct netmap_adapter *hwna = bna->hwna;
1394 	u_int ring_n = kring->ring_id;
1395 	u_int lim = kring->nkr_num_slots - 1;
1396 	struct netmap_kring *hw_kring;
1397 	int error;
1398 
1399 	nm_prdis("%s: na %s hwna %s",
1400 			(kring ? kring->name : "NULL!"),
1401 			(na ? na->name : "NULL!"),
1402 			(hwna ? hwna->name : "NULL!"));
1403 	hw_kring = hwna->tx_rings[ring_n];
1404 
1405 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
1406 		return ENXIO;
1407 	}
1408 
1409 	/* first step: simulate a user wakeup on the rx ring */
1410 	netmap_vp_rxsync(kring, flags);
1411 	nm_prdis("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1412 		na->name, ring_n,
1413 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1414 		kring->rhead, kring->rcur, kring->rtail,
1415 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1416 	/* second step: the new packets are sent on the tx ring
1417 	 * (which is actually the same ring)
1418 	 */
1419 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
1420 	error = hw_kring->nm_sync(hw_kring, flags);
1421 	if (error)
1422 		goto put_out;
1423 
1424 	/* third step: now we are back the rx ring */
1425 	/* claim ownership on all hw owned bufs */
1426 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
1427 
1428 	/* fourth step: the user goes to sleep again, causing another rxsync */
1429 	netmap_vp_rxsync(kring, flags);
1430 	nm_prdis("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1431 		na->name, ring_n,
1432 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1433 		kring->rhead, kring->rcur, kring->rtail,
1434 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1435 put_out:
1436 	nm_kr_put(hw_kring);
1437 
1438 	return error ? error : NM_IRQ_COMPLETED;
1439 }
1440 
1441 
1442 /* nm_bdg_ctl callback for the bwrap.
1443  * Called on bridge-attach and detach, as an effect of valectl -[ahd].
1444  * On attach, it needs to provide a fake netmap_priv_d structure and
1445  * perform a netmap_do_regif() on the bwrap. This will put both the
1446  * bwrap and the hwna in netmap mode, with the netmap rings shared
1447  * and cross linked. Moroever, it will start intercepting interrupts
1448  * directed to hwna.
1449  */
1450 static int
netmap_bwrap_bdg_ctl(struct nmreq_header * hdr,struct netmap_adapter * na)1451 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
1452 {
1453 	struct netmap_priv_d *npriv;
1454 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1455 	int error = 0;
1456 
1457 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
1458 		struct nmreq_vale_attach *req =
1459 			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1460 		if (req->reg.nr_ringid != 0 ||
1461 			(req->reg.nr_mode != NR_REG_ALL_NIC &&
1462 				req->reg.nr_mode != NR_REG_NIC_SW)) {
1463 			/* We only support attaching all the NIC rings
1464 			 * and/or the host stack. */
1465 			return EINVAL;
1466 		}
1467 		if (NETMAP_OWNED_BY_ANY(na)) {
1468 			return EBUSY;
1469 		}
1470 		if (bna->na_kpriv) {
1471 			/* nothing to do */
1472 			return 0;
1473 		}
1474 		npriv = netmap_priv_new();
1475 		if (npriv == NULL)
1476 			return ENOMEM;
1477 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
1478 		error = netmap_do_regif(npriv, na, hdr);
1479 		if (error) {
1480 			netmap_priv_delete(npriv);
1481 			return error;
1482 		}
1483 		bna->na_kpriv = npriv;
1484 		na->na_flags |= NAF_BUSY;
1485 	} else {
1486 		if (na->active_fds == 0) /* not registered */
1487 			return EINVAL;
1488 		netmap_priv_delete(bna->na_kpriv);
1489 		bna->na_kpriv = NULL;
1490 		na->na_flags &= ~NAF_BUSY;
1491 	}
1492 
1493 	return error;
1494 }
1495 
1496 /* attach a bridge wrapper to the 'real' device */
1497 int
netmap_bwrap_attach_common(struct netmap_adapter * na,struct netmap_adapter * hwna)1498 netmap_bwrap_attach_common(struct netmap_adapter *na,
1499 		struct netmap_adapter *hwna)
1500 {
1501 	struct netmap_bwrap_adapter *bna;
1502 	struct netmap_adapter *hostna = NULL;
1503 	int error = 0;
1504 	enum txrx t;
1505 
1506 	/* make sure the NIC is not already in use */
1507 	if (NETMAP_OWNED_BY_ANY(hwna)) {
1508 		nm_prerr("NIC %s busy, cannot attach to bridge", hwna->name);
1509 		return EBUSY;
1510 	}
1511 
1512 	bna = (struct netmap_bwrap_adapter *)na;
1513 	/* make bwrap ifp point to the real ifp */
1514 	na->ifp = hwna->ifp;
1515 	if_ref(na->ifp);
1516 	na->na_private = bna;
1517 	/* fill the ring data for the bwrap adapter with rx/tx meanings
1518 	 * swapped. The real cross-linking will be done during register,
1519 	 * when all the krings will have been created.
1520 	 */
1521 	for_rx_tx(t) {
1522 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1523 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
1524 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
1525 	}
1526 	na->nm_dtor = netmap_bwrap_dtor;
1527 	na->nm_config = netmap_bwrap_config;
1528 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
1529 	na->pdev = hwna->pdev;
1530 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
1531 	na->virt_hdr_len = hwna->virt_hdr_len;
1532 	na->rx_buf_maxsize = hwna->rx_buf_maxsize;
1533 
1534 	bna->hwna = hwna;
1535 	netmap_adapter_get(hwna);
1536 	hwna->na_private = bna; /* weak reference */
1537 	bna->saved_na_vp = hwna->na_vp;
1538 	hwna->na_vp = &bna->up;
1539 	bna->up.up.na_vp = &(bna->up);
1540 
1541 	if (hwna->na_flags & NAF_HOST_RINGS) {
1542 		if (hwna->na_flags & NAF_SW_ONLY)
1543 			na->na_flags |= NAF_SW_ONLY;
1544 		na->na_flags |= NAF_HOST_RINGS;
1545 		hostna = &bna->host.up;
1546 
1547 		/* limit the number of host rings to that of hw */
1548 		nm_bound_var(&hostna->num_tx_rings, 1, 1,
1549 				nma_get_nrings(hwna, NR_TX), NULL);
1550 		nm_bound_var(&hostna->num_rx_rings, 1, 1,
1551 				nma_get_nrings(hwna, NR_RX), NULL);
1552 
1553 		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
1554 		hostna->ifp = hwna->ifp;
1555 		for_rx_tx(t) {
1556 			enum txrx r = nm_txrx_swap(t);
1557 			u_int nr = nma_get_nrings(hostna, t);
1558 
1559 			nma_set_nrings(hostna, t, nr);
1560 			nma_set_host_nrings(na, t, nr);
1561 			if (nma_get_host_nrings(hwna, t) < nr) {
1562 				nma_set_host_nrings(hwna, t, nr);
1563 			}
1564 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
1565 		}
1566 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
1567 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
1568 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
1569 		hostna->na_private = bna;
1570 		hostna->na_vp = &bna->up;
1571 		na->na_hostvp = hwna->na_hostvp =
1572 			hostna->na_hostvp = &bna->host;
1573 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
1574 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
1575 	}
1576 	if (hwna->na_flags & NAF_MOREFRAG)
1577 		na->na_flags |= NAF_MOREFRAG;
1578 
1579 	nm_prdis("%s<->%s txr %d txd %d rxr %d rxd %d",
1580 		na->name, ifp->if_xname,
1581 		na->num_tx_rings, na->num_tx_desc,
1582 		na->num_rx_rings, na->num_rx_desc);
1583 
1584 	error = netmap_attach_common(na);
1585 	if (error) {
1586 		goto err_put;
1587 	}
1588 	hwna->na_flags |= NAF_BUSY;
1589 	return 0;
1590 
1591 err_put:
1592 	hwna->na_vp = hwna->na_hostvp = NULL;
1593 	netmap_adapter_put(hwna);
1594 	return error;
1595 
1596 }
1597 
1598 struct nm_bridge *
netmap_init_bridges2(u_int n)1599 netmap_init_bridges2(u_int n)
1600 {
1601 	int i;
1602 	struct nm_bridge *b;
1603 
1604 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
1605 	if (b == NULL)
1606 		return NULL;
1607 	for (i = 0; i < n; i++)
1608 		BDG_RWINIT(&b[i]);
1609 	return b;
1610 }
1611 
1612 void
netmap_uninit_bridges2(struct nm_bridge * b,u_int n)1613 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
1614 {
1615 	int i;
1616 
1617 	if (b == NULL)
1618 		return;
1619 
1620 	for (i = 0; i < n; i++)
1621 		BDG_RWDESTROY(&b[i]);
1622 	nm_os_free(b);
1623 }
1624 
1625 int
netmap_init_bridges(void)1626 netmap_init_bridges(void)
1627 {
1628 #ifdef CONFIG_NET_NS
1629 	return netmap_bns_register();
1630 #else
1631 	nm_bridges = netmap_init_bridges2(vale_max_bridges);
1632 	if (nm_bridges == NULL)
1633 		return ENOMEM;
1634 	return 0;
1635 #endif
1636 }
1637 
1638 void
netmap_uninit_bridges(void)1639 netmap_uninit_bridges(void)
1640 {
1641 #ifdef CONFIG_NET_NS
1642 	netmap_bns_unregister();
1643 #else
1644 	netmap_uninit_bridges2(nm_bridges, vale_max_bridges);
1645 #endif
1646 }
1647