1 /*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26
27 /*
28 * $FreeBSD$
29 *
30 * This module supports memory mapped access to network devices,
31 * see netmap(4).
32 *
33 * The module uses a large memory pool allocated by the kernel
34 * and accessible as mmapped memory by multiple userspace threads/processes.
35 * The memory pool contains packet buffers and "netmap rings",
36 * i.e. user-accessible copies of the interface's queues.
37 *
38 * Access to the network card works like this:
39 * 1. a process/thread issues one or more open() on /dev/netmap, to create
40 * a select()able file descriptor on which events are reported.
41 * 2. on each descriptor, the process issues an ioctl() to identify
42 * the interface that should report events to the file descriptor.
43 * 3. on each descriptor, the process issues an mmap() request to
44 * map the shared memory region within the process' address space.
45 * The list of interesting queues is indicated by a location in
46 * the shared memory region.
47 * 4. using the functions in the netmap(4) userspace API, a process
48 * can look up the occupation state of a queue, access memory buffers,
49 * and retrieve received packets or enqueue packets to transmit.
50 * 5. using some ioctl()s the process can synchronize the userspace view
51 * of the queue with the actual status in the kernel. This includes both
52 * receiving the notification of new packets, and transmitting new
53 * packets on the output interface.
54 * 6. select() or poll() can be used to wait for events on individual
55 * transmit or receive queues (or all queues for a given interface).
56 *
57
58 SYNCHRONIZATION (USER)
59
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67
68 LOCKING (INTERNAL)
69
70 Within the kernel, access to the netmap rings is protected as follows:
71
72 - a spinlock on each ring, to handle producer/consumer races on
73 RX rings attached to the host stack (against multiple host
74 threads writing from the host stack to the same ring),
75 and on 'destination' rings attached to a VALE switch
76 (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
77 protecting against multiple active senders for the same destination.
78
79 - an atomic variable to guarantee that there is at most one
80 instance of *_*xsync() on the ring at any time.
81 For rings connected to user file
82 descriptors, an atomic_test_and_set() protects this, and the
83 lock on the ring is not actually used.
84 For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85 is also used to prevent multiple executions (the driver might indeed
86 already guarantee this).
87 For NIC TX rings connected to a VALE switch, the lock arbitrates
88 access to the queue (both when allocating buffers and when pushing
89 them out).
90
91 - *xsync() should be protected against initializations of the card.
92 On FreeBSD most devices have the reset routine protected by
93 a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94 the RING protection on rx_reset(), this should be added.
95
96 On linux there is an external lock on the tx path, which probably
97 also arbitrates access to the reset routine. XXX to be revised
98
99 - a per-interface core_lock protecting access from the host stack
100 while interfaces may be detached from netmap mode.
101 XXX there should be no need for this lock if we detach the interfaces
102 only while they are down.
103
104
105 --- VALE SWITCH ---
106
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring or deleting a new port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124
125 */
126
127
128 /* --- internals ----
129 *
130 * Roadmap to the code that implements the above.
131 *
132 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133 * > select()able file descriptor on which events are reported.
134 *
135 * Internally, we allocate a netmap_priv_d structure, that will be
136 * initialized on ioctl(NIOCREGIF).
137 *
138 * os-specific:
139 * FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140 * per-thread.
141 * linux: linux_netmap_open (netmap_linux.c). The priv is
142 * per-open.
143 *
144 * > 2. on each descriptor, the process issues an ioctl() to identify
145 * > the interface that should report events to the file descriptor.
146 *
147 * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148 * Most important things happen in netmap_get_na() and
149 * netmap_do_regif(), called from there. Additional details can be
150 * found in the comments above those functions.
151 *
152 * In all cases, this action creates/takes-a-reference-to a
153 * netmap_*_adapter describing the port, and allocates a netmap_if
154 * and all necessary netmap rings, filling them with netmap buffers.
155 *
156 * In this phase, the sync callbacks for each ring are set (these are used
157 * in steps 5 and 6 below). The callbacks depend on the type of adapter.
158 * The adapter creation/initialization code puts them in the
159 * netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they
160 * are copied from there to the netmap_kring's during netmap_do_regif(), by
161 * the nm_krings_create() callback. All the nm_krings_create callbacks
162 * actually call netmap_krings_create() to perform this and the other
163 * common stuff. netmap_krings_create() also takes care of the host rings,
164 * if needed, by setting their sync callbacks appropriately.
165 *
166 * Additional actions depend on the kind of netmap_adapter that has been
167 * registered:
168 *
169 * - netmap_hw_adapter: [netmap.c]
170 * This is a system netdev/ifp with native netmap support.
171 * The ifp is detached from the host stack by redirecting:
172 * - transmissions (from the network stack) to netmap_transmit()
173 * - receive notifications to the nm_notify() callback for
174 * this adapter. The callback is normally netmap_notify(), unless
175 * the ifp is attached to a bridge using bwrap, in which case it
176 * is netmap_bwrap_intr_notify().
177 *
178 * - netmap_generic_adapter: [netmap_generic.c]
179 * A system netdev/ifp without native netmap support.
180 *
181 * (the decision about native/non native support is taken in
182 * netmap_get_hw_na(), called by netmap_get_na())
183 *
184 * - netmap_vp_adapter [netmap_vale.c]
185 * Returned by netmap_get_bdg_na().
186 * This is a persistent or ephemeral VALE port. Ephemeral ports
187 * are created on the fly if they don't already exist, and are
188 * always attached to a bridge.
189 * Persistent VALE ports must be created separately, and
190 * then attached like normal NICs. The NIOCREGIF we are examining
191 * will find them only if they had previously been created and
192 * attached (see VALE_CTL below).
193 *
194 * - netmap_pipe_adapter [netmap_pipe.c]
195 * Returned by netmap_get_pipe_na().
196 * Both pipe ends are created, if they didn't already exist.
197 *
198 * - netmap_monitor_adapter [netmap_monitor.c]
199 * Returned by netmap_get_monitor_na().
200 * If successful, the nm_sync callbacks of the monitored adapter
201 * will be intercepted by the returned monitor.
202 *
203 * - netmap_bwrap_adapter [netmap_vale.c]
204 * Cannot be obtained in this way, see VALE_CTL below
205 *
206 *
207 * os-specific:
208 * linux: we first go through linux_netmap_ioctl() to
209 * adapt the FreeBSD interface to the linux one.
210 *
211 *
212 * > 3. on each descriptor, the process issues an mmap() request to
213 * > map the shared memory region within the process' address space.
214 * > The list of interesting queues is indicated by a location in
215 * > the shared memory region.
216 *
217 * os-specific:
218 * FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219 * linux: linux_netmap_mmap (netmap_linux.c).
220 *
221 * > 4. using the functions in the netmap(4) userspace API, a process
222 * > can look up the occupation state of a queue, access memory buffers,
223 * > and retrieve received packets or enqueue packets to transmit.
224 *
225 * these actions do not involve the kernel.
226 *
227 * > 5. using some ioctl()s the process can synchronize the userspace view
228 * > of the queue with the actual status in the kernel. This includes both
229 * > receiving the notification of new packets, and transmitting new
230 * > packets on the output interface.
231 *
232 * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233 * cases. They invoke the nm_sync callbacks on the netmap_kring
234 * structures, as initialized in step 2 and maybe later modified
235 * by a monitor. Monitors, however, will always call the original
236 * callback before doing anything else.
237 *
238 *
239 * > 6. select() or poll() can be used to wait for events on individual
240 * > transmit or receive queues (or all queues for a given interface).
241 *
242 * Implemented in netmap_poll(). This will call the same nm_sync()
243 * callbacks as in step 5 above.
244 *
245 * os-specific:
246 * linux: we first go through linux_netmap_poll() to adapt
247 * the FreeBSD interface to the linux one.
248 *
249 *
250 * ---- VALE_CTL -----
251 *
252 * VALE switches are controlled by issuing a NIOCREGIF with a non-null
253 * nr_cmd in the nmreq structure. These subcommands are handled by
254 * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255 * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256 * subcommands, respectively.
257 *
258 * Any network interface known to the system (including a persistent VALE
259 * port) can be attached to a VALE switch by issuing the
260 * NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261 * look exactly like ephemeral VALE ports (as created in step 2 above). The
262 * attachment of other interfaces, instead, requires the creation of a
263 * netmap_bwrap_adapter. Moreover, the attached interface must be put in
264 * netmap mode. This may require the creation of a netmap_generic_adapter if
265 * we have no native support for the interface, or if generic adapters have
266 * been forced by sysctl.
267 *
268 * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269 * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270 * callback. In the case of the bwrap, the callback creates the
271 * netmap_bwrap_adapter. The initialization of the bwrap is then
272 * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273 * callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274 * A generic adapter for the wrapped ifp will be created if needed, when
275 * netmap_get_bdg_na() calls netmap_get_hw_na().
276 *
277 *
278 * ---- DATAPATHS -----
279 *
280 * -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281 *
282 * na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283 *
284 * - tx from netmap userspace:
285 * concurrently:
286 * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287 * kring->nm_sync() == DEVICE_netmap_txsync()
288 * 2) device interrupt handler
289 * na->nm_notify() == netmap_notify()
290 * - rx from netmap userspace:
291 * concurrently:
292 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293 * kring->nm_sync() == DEVICE_netmap_rxsync()
294 * 2) device interrupt handler
295 * na->nm_notify() == netmap_notify()
296 * - rx from host stack
297 * concurrently:
298 * 1) host stack
299 * netmap_transmit()
300 * na->nm_notify == netmap_notify()
301 * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302 * kring->nm_sync() == netmap_rxsync_from_host_compat
303 * netmap_rxsync_from_host(na, NULL, NULL)
304 * - tx to host stack
305 * ioctl(NIOCTXSYNC)/netmap_poll() in process context
306 * kring->nm_sync() == netmap_txsync_to_host_compat
307 * netmap_txsync_to_host(na)
308 * NM_SEND_UP()
309 * FreeBSD: na->if_input() == ?? XXX
310 * linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311 *
312 *
313 *
314 * -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315 *
316 * na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
317 *
318 * - tx from netmap userspace:
319 * concurrently:
320 * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
321 * kring->nm_sync() == generic_netmap_txsync()
322 * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
323 * generic_ndo_start_xmit()
324 * orig. dev. start_xmit
325 * FreeBSD: na->if_transmit() == orig. dev if_transmit
326 * 2) generic_mbuf_destructor()
327 * na->nm_notify() == netmap_notify()
328 * - rx from netmap userspace:
329 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
330 * kring->nm_sync() == generic_netmap_rxsync()
331 * mbq_safe_dequeue()
332 * 2) device driver
333 * generic_rx_handler()
334 * mbq_safe_enqueue()
335 * na->nm_notify() == netmap_notify()
336 * - rx from host stack:
337 * concurrently:
338 * 1) host stack
339 * linux: generic_ndo_start_xmit()
340 * netmap_transmit()
341 * FreeBSD: ifp->if_input() == netmap_transmit
342 * both:
343 * na->nm_notify() == netmap_notify()
344 * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
345 * kring->nm_sync() == netmap_rxsync_from_host_compat
346 * netmap_rxsync_from_host(na, NULL, NULL)
347 * - tx to host stack:
348 * ioctl(NIOCTXSYNC)/netmap_poll() in process context
349 * kring->nm_sync() == netmap_txsync_to_host_compat
350 * netmap_txsync_to_host(na)
351 * NM_SEND_UP()
352 * FreeBSD: na->if_input() == ??? XXX
353 * linux: netif_rx() with NM_MAGIC_PRIORITY_RX
354 *
355 *
356 * -= VALE =-
357 *
358 * INCOMING:
359 *
360 * - VALE ports:
361 * ioctl(NIOCTXSYNC)/netmap_poll() in process context
362 * kring->nm_sync() == netmap_vp_txsync()
363 *
364 * - system device with native support:
365 * from cable:
366 * interrupt
367 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
368 * kring->nm_sync() == DEVICE_netmap_rxsync()
369 * netmap_vp_txsync()
370 * kring->nm_sync() == DEVICE_netmap_rxsync()
371 * from host stack:
372 * netmap_transmit()
373 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
374 * kring->nm_sync() == netmap_rxsync_from_host_compat()
375 * netmap_vp_txsync()
376 *
377 * - system device with generic support:
378 * from device driver:
379 * generic_rx_handler()
380 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
381 * kring->nm_sync() == generic_netmap_rxsync()
382 * netmap_vp_txsync()
383 * kring->nm_sync() == generic_netmap_rxsync()
384 * from host stack:
385 * netmap_transmit()
386 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
387 * kring->nm_sync() == netmap_rxsync_from_host_compat()
388 * netmap_vp_txsync()
389 *
390 * (all cases) --> nm_bdg_flush()
391 * dest_na->nm_notify() == (see below)
392 *
393 * OUTGOING:
394 *
395 * - VALE ports:
396 * concurrently:
397 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
398 * kring->nm_sync() == netmap_vp_rxsync()
399 * 2) from nm_bdg_flush()
400 * na->nm_notify() == netmap_notify()
401 *
402 * - system device with native support:
403 * to cable:
404 * na->nm_notify() == netmap_bwrap_notify()
405 * netmap_vp_rxsync()
406 * kring->nm_sync() == DEVICE_netmap_txsync()
407 * netmap_vp_rxsync()
408 * to host stack:
409 * netmap_vp_rxsync()
410 * kring->nm_sync() == netmap_txsync_to_host_compat
411 * netmap_vp_rxsync_locked()
412 *
413 * - system device with generic adapter:
414 * to device driver:
415 * na->nm_notify() == netmap_bwrap_notify()
416 * netmap_vp_rxsync()
417 * kring->nm_sync() == generic_netmap_txsync()
418 * netmap_vp_rxsync()
419 * to host stack:
420 * netmap_vp_rxsync()
421 * kring->nm_sync() == netmap_txsync_to_host_compat
422 * netmap_vp_rxsync()
423 *
424 */
425
426 /*
427 * OS-specific code that is used only within this file.
428 * Other OS-specific code that must be accessed by drivers
429 * is present in netmap_kern.h
430 */
431
432 #if defined(__FreeBSD__)
433 #include <sys/cdefs.h> /* prerequisite */
434 #include <sys/types.h>
435 #include <sys/errno.h>
436 #include <sys/param.h> /* defines used in kernel.h */
437 #include <sys/kernel.h> /* types used in module initialization */
438 #include <sys/conf.h> /* cdevsw struct, UID, GID */
439 #include <sys/filio.h> /* FIONBIO */
440 #include <sys/sockio.h>
441 #include <sys/socketvar.h> /* struct socket */
442 #include <sys/malloc.h>
443 #include <sys/poll.h>
444 #include <sys/rwlock.h>
445 #include <sys/socket.h> /* sockaddrs */
446 #include <sys/selinfo.h>
447 #include <sys/sysctl.h>
448 #include <sys/jail.h>
449 #include <net/vnet.h>
450 #include <net/if.h>
451 #include <net/if_var.h>
452 #include <net/bpf.h> /* BIOCIMMEDIATE */
453 #include <machine/bus.h> /* bus_dmamap_* */
454 #include <sys/endian.h>
455 #include <sys/refcount.h>
456
457
/*
 * Reduce conditional code: expose the linux-style wait queue
 * initializer on FreeBSD, where it sets up the knlist embedded in
 * the selinfo. A private mutex is used for the knlist.
 */
#define init_waitqueue_head(x)					\
	do {							\
		struct mtx *m = &(x)->m;			\
		mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
		knlist_init_mtx(&(x)->si.si_note, m);		\
	} while (0)

/* selrecord()/selwakeup() wrappers taking the netmap selinfo container */
#define OS_selrecord(a, b)	selrecord(a, &((b)->si))
#define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
469
470 #elif defined(linux)
471
472 #include "bsd_glue.h"
473
474
475
476 #elif defined(__APPLE__)
477
478 #warning OSX support is only partial
479 #include "osx_glue.h"
480
481 #else
482
483 #error Unsupported platform
484
485 #endif /* unsupported */
486
487 /*
488 * common headers
489 */
490 #include <net/netmap.h>
491 #include <dev/netmap/netmap_kern.h>
492 #include <dev/netmap/netmap_mem2.h>
493
494
495 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
496
497 /* user-controlled variables */
498 int netmap_verbose;
499
500 static int netmap_no_timestamp; /* don't timestamp on rxsync */
501
502 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
503 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
504 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
505 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
506 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
507 int netmap_mitigate = 1;
508 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
509 int netmap_no_pendintr = 1;
510 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
511 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
512 int netmap_txsync_retry = 2;
513 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
514 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
515
516 int netmap_adaptive_io = 0;
517 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
518 &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
519
520 int netmap_flags = 0; /* debug flags */
521 int netmap_fwd = 0; /* force transparent mode */
522
523 /*
524 * netmap_admode selects the netmap mode to use.
525 * Invalid values are reset to NETMAP_ADMODE_BEST
526 */
527 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */
528 NETMAP_ADMODE_NATIVE, /* either native or none */
529 NETMAP_ADMODE_GENERIC, /* force generic */
530 NETMAP_ADMODE_LAST };
531 static int netmap_admode = NETMAP_ADMODE_BEST;
532
533 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */
534 int netmap_generic_ringsize = 1024; /* Generic ringsize. */
535 int netmap_generic_rings = 1; /* number of queues in generic. */
536
537 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
538 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
539 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
540 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
541 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
543
544 NMG_LOCK_T netmap_global_lock;
545 int netmap_use_count = 0; /* number of active netmap instances */
546
547 /*
548 * mark the ring as stopped, and run through the locks
549 * to make sure other users get to see it.
550 */
551 static void
netmap_disable_ring(struct netmap_kring * kr)552 netmap_disable_ring(struct netmap_kring *kr)
553 {
554 kr->nkr_stopped = 1;
555 nm_kr_get(kr);
556 mtx_lock(&kr->q_lock);
557 mtx_unlock(&kr->q_lock);
558 nm_kr_put(kr);
559 }
560
561 /* stop or enable a single ring */
562 void
netmap_set_ring(struct netmap_adapter * na,u_int ring_id,enum txrx t,int stopped)563 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
564 {
565 if (stopped)
566 netmap_disable_ring(NMR(na, t) + ring_id);
567 else
568 NMR(na, t)[ring_id].nkr_stopped = 0;
569 }
570
571
572 /* stop or enable all the rings of na */
573 void
netmap_set_all_rings(struct netmap_adapter * na,int stopped)574 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
575 {
576 int i;
577 enum txrx t;
578
579 if (!nm_netmap_on(na))
580 return;
581
582 for_rx_tx(t) {
583 for (i = 0; i < netmap_real_rings(na, t); i++) {
584 netmap_set_ring(na, i, t, stopped);
585 }
586 }
587 }
588
/*
 * Convenience function for drivers: stop every ring of the adapter
 * attached to ifp. This waits for the txsync()s/rxsync()s in progress
 * to finish and prevents new ones from starting. Call it before turning
 * netmap mode off, or before removing the hardware rings (e.g., on
 * module unload). As a rule of thumb for linux drivers, place it near
 * each napi_disable().
 */
void
netmap_disable_all_rings(struct ifnet *ifp)
{
	struct netmap_adapter *na = NA(ifp);

	netmap_set_all_rings(na, 1 /* stopped */);
}
601
/*
 * Convenience function for drivers: re-enable rxsync and txsync on
 * every ring of the adapter attached to ifp. In linux drivers this
 * should be placed near each napi_enable().
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	struct netmap_adapter *na = NA(ifp);

	netmap_set_all_rings(na, 0 /* enabled */);
}
612
613
/*
 * Generic bounds-checking function.
 *
 * Forces *v into the range [lo, hi]:
 *  - a value below lo is replaced by dflt (dflt itself is first clipped
 *    into [lo, hi]);
 *  - a value above hi is replaced by hi;
 *  - a value already in range is left untouched.
 *
 * v:     variable to check, updated in place
 * dflt:  replacement used when *v is too small
 * lo,hi: inclusive bounds
 * msg:   name of the variable for the log line; pass NULL to stay silent
 *
 * Returns the (possibly adjusted) value of *v.
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;	/* set only when *v has been changed */

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}
	/* both values are unsigned: print them with %u, not %d */
	if (op && msg)
		printf("%s %s to %u (was %u)\n", op, msg, *v, oldv);
	return *v;
}
638
639
/*
 * Packet-dump function: writes a hex + ASCII dump of a buffer into a
 * string and returns it.
 *
 * p:    buffer to dump
 * len:  length of the buffer
 * lim:  number of bytes to dump; values <= 0 or > len mean "all of len"
 * dst:  destination string, or NULL to use an internal static buffer
 *
 * The output is one header line followed by one 72-character row per
 * 16 input bytes (offset, 16 hex pairs, 16 printable characters, newline).
 * A user-supplied dst must therefore hold at least
 *	64 + 72 * ((lim + 15) / 16) + 1
 * bytes (rows grow by a few characters once the offset needs more than
 * 5 digits). When the internal buffer is used, lim is silently reduced
 * so that the dump always fits; note that the internal buffer is shared
 * by all callers.
 *
 * Returns the destination string.
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	enum {
		DUMP_STATIC_SIZE = 8192,	/* size of the internal buffer */
		DUMP_HDR_RESERVE = 64,		/* upper bound for the header line */
		DUMP_ROW_BYTES = 16,		/* input bytes per output row */
		/* "%5d: " + 48 hex columns + 16 chars + newline */
		DUMP_ROW_CHARS = 7 + 3 * DUMP_ROW_BYTES + DUMP_ROW_BYTES + 1,
		/* largest lim whose dump fits in the internal buffer */
		DUMP_STATIC_MAX = (DUMP_STATIC_SIZE - DUMP_HDR_RESERVE - 1) /
		    DUMP_ROW_CHARS * DUMP_ROW_BYTES
	};
	static char static_dst[DUMP_STATIC_SIZE];
	static char hex[] = "0123456789abcdef";
	int i, j, i0;
	int use_static = 0;
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst) {
		dst = static_dst;
		use_static = 1;
	}
	if (lim <= 0 || lim > len)
		lim = len;
	/* never write past the end of the internal buffer */
	if (use_static && lim > DUMP_STATIC_MAX)
		lim = DUMP_STATIC_MAX;
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", (void *)p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		/* blank the hex area so that short rows stay aligned */
		memset(o, ' ', 48);
		i0 = i;
		for (j = 0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		/* second pass over the same bytes for the ASCII column */
		i = i0;
		for (j = 0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}
684
685
686 /*
687 * Fetch configuration from the device, to cope with dynamic
688 * reconfigurations after loading the module.
689 */
690 /* call with NMG_LOCK held */
691 int
netmap_update_config(struct netmap_adapter * na)692 netmap_update_config(struct netmap_adapter *na)
693 {
694 u_int txr, txd, rxr, rxd;
695
696 txr = txd = rxr = rxd = 0;
697 if (na->nm_config == NULL ||
698 na->nm_config(na, &txr, &txd, &rxr, &rxd))
699 {
700 /* take whatever we had at init time */
701 txr = na->num_tx_rings;
702 txd = na->num_tx_desc;
703 rxr = na->num_rx_rings;
704 rxd = na->num_rx_desc;
705 }
706
707 if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
708 na->num_rx_rings == rxr && na->num_rx_desc == rxd)
709 return 0; /* nothing changed */
710 if (netmap_verbose || na->active_fds > 0) {
711 D("stored config %s: txring %d x %d, rxring %d x %d",
712 na->name,
713 na->num_tx_rings, na->num_tx_desc,
714 na->num_rx_rings, na->num_rx_desc);
715 D("new config %s: txring %d x %d, rxring %d x %d",
716 na->name, txr, txd, rxr, rxd);
717 }
718 if (na->active_fds == 0) {
719 D("configuration changed (but fine)");
720 na->num_tx_rings = txr;
721 na->num_tx_desc = txd;
722 na->num_rx_rings = rxr;
723 na->num_rx_desc = rxd;
724 return 0;
725 }
726 D("configuration changed while active, this is bad...");
727 return 1;
728 }
729
730 static void netmap_txsync_to_host(struct netmap_adapter *na);
731 static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
732
733 /* kring->nm_sync callback for the host tx ring */
734 static int
netmap_txsync_to_host_compat(struct netmap_kring * kring,int flags)735 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
736 {
737 (void)flags; /* unused */
738 netmap_txsync_to_host(kring->na);
739 return 0;
740 }
741
742 /* kring->nm_sync callback for the host rx ring */
743 static int
netmap_rxsync_from_host_compat(struct netmap_kring * kring,int flags)744 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
745 {
746 (void)flags; /* unused */
747 netmap_rxsync_from_host(kring->na, NULL, NULL);
748 return 0;
749 }
750
751
752
753 /* create the krings array and initialize the fields common to all adapters.
754 * The array layout is this:
755 *
756 * +----------+
757 * na->tx_rings ----->| | \
758 * | | } na->num_tx_ring
759 * | | /
760 * +----------+
761 * | | host tx kring
762 * na->rx_rings ----> +----------+
763 * | | \
764 * | | } na->num_rx_rings
765 * | | /
766 * +----------+
767 * | | host rx kring
768 * +----------+
769 * na->tailroom ----->| | \
770 * | | } tailroom bytes
771 * | | /
772 * +----------+
773 *
774 * Note: for compatibility, host krings are created even when not needed.
775 * The tailroom space is currently used by vale ports for allocating leases.
776 */
777 /* call with NMG_LOCK held */
778 int
netmap_krings_create(struct netmap_adapter * na,u_int tailroom)779 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
780 {
781 u_int i, len, ndesc;
782 struct netmap_kring *kring;
783 u_int n[NR_TXRX];
784 enum txrx t;
785
786 /* account for the (possibly fake) host rings */
787 n[NR_TX] = na->num_tx_rings + 1;
788 n[NR_RX] = na->num_rx_rings + 1;
789
790 len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;
791
792 na->tx_rings = malloc((size_t)len, M_NETMAP, M_NOWAIT | M_ZERO);
793 if (na->tx_rings == NULL) {
794 D("Cannot allocate krings");
795 return ENOMEM;
796 }
797 na->rx_rings = na->tx_rings + n[NR_TX];
798
799 /*
800 * All fields in krings are 0 except the one initialized below.
801 * but better be explicit on important kring fields.
802 */
803 for_rx_tx(t) {
804 ndesc = nma_get_ndesc(na, t);
805 for (i = 0; i < n[t]; i++) {
806 kring = &NMR(na, t)[i];
807 bzero(kring, sizeof(*kring));
808 kring->na = na;
809 kring->ring_id = i;
810 kring->tx = t;
811 kring->nkr_num_slots = ndesc;
812 if (i < nma_get_nrings(na, t)) {
813 kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
814 } else if (i == na->num_tx_rings) {
815 kring->nm_sync = (t == NR_TX ?
816 netmap_txsync_to_host_compat :
817 netmap_rxsync_from_host_compat);
818 }
819 kring->nm_notify = na->nm_notify;
820 kring->rhead = kring->rcur = kring->nr_hwcur = 0;
821 /*
822 * IMPORTANT: Always keep one slot empty.
823 */
824 kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
825 snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
826 nm_txrx2str(t), i);
827 ND("ktx %s h %d c %d t %d",
828 kring->name, kring->rhead, kring->rcur, kring->rtail);
829 mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
830 init_waitqueue_head(&kring->si);
831 }
832 init_waitqueue_head(&na->si[t]);
833 }
834
835 na->tailroom = na->rx_rings + n[NR_RX];
836
837 return 0;
838 }
839
840
841 #ifdef __FreeBSD__
842 static void
netmap_knlist_destroy(NM_SELINFO_T * si)843 netmap_knlist_destroy(NM_SELINFO_T *si)
844 {
845 /* XXX kqueue(9) needed; these will mirror knlist_init. */
846 knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
847 knlist_destroy(&si->si.si_note);
848 /* now we don't need the mutex anymore */
849 mtx_destroy(&si->m);
850 }
851 #endif /* __FreeBSD__ */
852
853
854 /* undo the actions performed by netmap_krings_create */
855 /* call with NMG_LOCK held */
856 void
netmap_krings_delete(struct netmap_adapter * na)857 netmap_krings_delete(struct netmap_adapter *na)
858 {
859 struct netmap_kring *kring = na->tx_rings;
860 enum txrx t;
861
862 for_rx_tx(t)
863 netmap_knlist_destroy(&na->si[t]);
864
865 /* we rely on the krings layout described above */
866 for ( ; kring != na->tailroom; kring++) {
867 mtx_destroy(&kring->q_lock);
868 netmap_knlist_destroy(&kring->si);
869 }
870 free(na->tx_rings, M_NETMAP);
871 na->tx_rings = na->rx_rings = na->tailroom = NULL;
872 }
873
874
875 /*
876 * Destructor for NIC ports. They also have an mbuf queue
877 * on the rings connected to the host so we need to purge
878 * them first.
879 */
880 /* call with NMG_LOCK held */
881 static void
netmap_hw_krings_delete(struct netmap_adapter * na)882 netmap_hw_krings_delete(struct netmap_adapter *na)
883 {
884 struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
885
886 ND("destroy sw mbq with len %d", mbq_len(q));
887 mbq_purge(q);
888 mbq_safe_destroy(q);
889 netmap_krings_delete(na);
890 }
891
892
893
894 /*
895 * Undo everything that was done in netmap_do_regif(). In particular,
896 * call nm_register(ifp,0) to stop netmap mode on the interface and
897 * revert to normal operation.
898 */
899 /* call with NMG_LOCK held */
900 static void netmap_unset_ringid(struct netmap_priv_d *);
901 static void netmap_rel_exclusive(struct netmap_priv_d *);
902 static void
netmap_do_unregif(struct netmap_priv_d * priv)903 netmap_do_unregif(struct netmap_priv_d *priv)
904 {
905 struct netmap_adapter *na = priv->np_na;
906
907 NMG_LOCK_ASSERT();
908 na->active_fds--;
909 /* release exclusive use if it was requested on regif */
910 netmap_rel_exclusive(priv);
911 if (na->active_fds <= 0) { /* last instance */
912
913 if (netmap_verbose)
914 D("deleting last instance for %s", na->name);
915
916 #ifdef WITH_MONITOR
917 /* walk through all the rings and tell any monitor
918 * that the port is going to exit netmap mode
919 */
920 netmap_monitor_stop(na);
921 #endif
922 /*
923 * (TO CHECK) This function is only called
924 * when the last reference to this file descriptor goes
925 * away. This means we cannot have any pending poll()
926 * or interrupt routine operating on the structure.
927 * XXX The file may be closed in a thread while
928 * another thread is using it.
929 * Linux keeps the file opened until the last reference
930 * by any outstanding ioctl/poll or mmap is gone.
931 * FreeBSD does not track mmap()s (but we do) and
932 * wakes up any sleeping poll(). Need to check what
933 * happens if the close() occurs while a concurrent
934 * syscall is running.
935 */
936 na->nm_register(na, 0); /* off, clear flags */
937 /* Wake up any sleeping threads. netmap_poll will
938 * then return POLLERR
939 * XXX The wake up now must happen during *_down(), when
940 * we order all activities to stop. -gl
941 */
942 /* delete rings and buffers */
943 netmap_mem_rings_delete(na);
944 na->nm_krings_delete(na);
945 }
946 /* possibily decrement counter of tx_si/rx_si users */
947 netmap_unset_ringid(priv);
948 /* delete the nifp */
949 netmap_mem_if_delete(na, priv->np_nifp);
950 /* drop the allocator */
951 netmap_mem_deref(na->nm_mem, na);
952 /* mark the priv as unregistered */
953 priv->np_na = NULL;
954 priv->np_nifp = NULL;
955 }
956
957 /* call with NMG_LOCK held */
958 static __inline int
nm_si_user(struct netmap_priv_d * priv,enum txrx t)959 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
960 {
961 return (priv->np_na != NULL &&
962 (priv->np_qlast[t] - priv->np_qfirst[t] > 1));
963 }
964
965 /*
966 * Destructor of the netmap_priv_d, called when the fd is closed
967 * Action: undo all the things done by NIOCREGIF,
968 * On FreeBSD we need to track whether there are active mmap()s,
969 * and we use np_active_mmaps for that. On linux, the field is always 0.
970 * Return: 1 if we can free priv, 0 otherwise.
971 *
972 */
973 /* call with NMG_LOCK held */
974 int
netmap_dtor_locked(struct netmap_priv_d * priv)975 netmap_dtor_locked(struct netmap_priv_d *priv)
976 {
977 struct netmap_adapter *na = priv->np_na;
978
979 /* number of active references to this fd */
980 if (--priv->np_refs > 0) {
981 return 0;
982 }
983 netmap_use_count--;
984 if (!na) {
985 return 1; //XXX is it correct?
986 }
987 netmap_do_unregif(priv);
988 netmap_adapter_put(na);
989 return 1;
990 }
991
992
993 /* call with NMG_LOCK *not* held */
994 void
netmap_dtor(void * data)995 netmap_dtor(void *data)
996 {
997 struct netmap_priv_d *priv = data;
998 int last_instance;
999
1000 NMG_LOCK();
1001 last_instance = netmap_dtor_locked(priv);
1002 NMG_UNLOCK();
1003 if (last_instance) {
1004 bzero(priv, sizeof(*priv)); /* for safety */
1005 free(priv, M_NETMAP);
1006 }
1007 }
1008
1009
1010
1011
1012 /*
1013 * Handlers for synchronization of the queues from/to the host.
1014 * Netmap has two operating modes:
1015 * - in the default mode, the rings connected to the host stack are
1016 * just another ring pair managed by userspace;
1017 * - in transparent mode (XXX to be defined) incoming packets
1018 * (from the host or the NIC) are marked as NS_FORWARD upon
1019 * arrival, and the user application has a chance to reset the
1020 * flag for packets that should be dropped.
 *	On the RXSYNC or poll(), packets in RX rings between
 *	kring->nr_hwcur and ring->head with NS_FORWARD still set are moved
 *	to the other side.
1024 * The transfer NIC --> host is relatively easy, just encapsulate
1025 * into mbufs and we are done. The host --> NIC side is slightly
1026 * harder because there might not be room in the tx ring so it
1027 * might take a while before releasing the buffer.
1028 */
1029
1030
1031 /*
1032 * pass a chain of buffers to the host stack as coming from 'dst'
1033 * We do not need to lock because the queue is private.
1034 */
1035 static void
netmap_send_up(struct ifnet * dst,struct mbq * q)1036 netmap_send_up(struct ifnet *dst, struct mbq *q)
1037 {
1038 struct mbuf *m;
1039
1040 /* send packets up, outside the lock */
1041 while ((m = mbq_dequeue(q)) != NULL) {
1042 if (netmap_verbose & NM_VERB_HOST)
1043 D("sending up pkt %p size %d", m, MBUF_LEN(m));
1044 NM_SEND_UP(dst, m);
1045 }
1046 mbq_destroy(q);
1047 }
1048
1049
1050 /*
1051 * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1052 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1053 * and pass them up. Drop remaining packets in the unlikely event
1054 * of an mbuf shortage.
1055 */
1056 static void
netmap_grab_packets(struct netmap_kring * kring,struct mbq * q,int force)1057 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1058 {
1059 u_int const lim = kring->nkr_num_slots - 1;
1060 u_int const head = kring->rhead;
1061 u_int n;
1062 struct netmap_adapter *na = kring->na;
1063
1064 for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1065 struct mbuf *m;
1066 struct netmap_slot *slot = &kring->ring->slot[n];
1067
1068 if ((slot->flags & NS_FORWARD) == 0 && !force)
1069 continue;
1070 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1071 RD(5, "bad pkt at %d len %d", n, slot->len);
1072 continue;
1073 }
1074 slot->flags &= ~NS_FORWARD; // XXX needed ?
1075 /* XXX TODO: adapt to the case of a multisegment packet */
1076 m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1077
1078 if (m == NULL)
1079 break;
1080 mbq_enqueue(q, m);
1081 }
1082 }
1083
1084
1085 /*
1086 * Send to the NIC rings packets marked NS_FORWARD between
1087 * kring->nr_hwcur and kring->rhead
1088 * Called under kring->rx_queue.lock on the sw rx ring,
1089 */
1090 static u_int
netmap_sw_to_nic(struct netmap_adapter * na)1091 netmap_sw_to_nic(struct netmap_adapter *na)
1092 {
1093 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1094 struct netmap_slot *rxslot = kring->ring->slot;
1095 u_int i, rxcur = kring->nr_hwcur;
1096 u_int const head = kring->rhead;
1097 u_int const src_lim = kring->nkr_num_slots - 1;
1098 u_int sent = 0;
1099
1100 /* scan rings to find space, then fill as much as possible */
1101 for (i = 0; i < na->num_tx_rings; i++) {
1102 struct netmap_kring *kdst = &na->tx_rings[i];
1103 struct netmap_ring *rdst = kdst->ring;
1104 u_int const dst_lim = kdst->nkr_num_slots - 1;
1105
1106 /* XXX do we trust ring or kring->rcur,rtail ? */
1107 for (; rxcur != head && !nm_ring_empty(rdst);
1108 rxcur = nm_next(rxcur, src_lim) ) {
1109 struct netmap_slot *src, *dst, tmp;
1110 u_int dst_cur = rdst->cur;
1111
1112 src = &rxslot[rxcur];
1113 if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1114 continue;
1115
1116 sent++;
1117
1118 dst = &rdst->slot[dst_cur];
1119
1120 tmp = *src;
1121
1122 src->buf_idx = dst->buf_idx;
1123 src->flags = NS_BUF_CHANGED;
1124
1125 dst->buf_idx = tmp.buf_idx;
1126 dst->len = tmp.len;
1127 dst->flags = NS_BUF_CHANGED;
1128
1129 rdst->cur = nm_next(dst_cur, dst_lim);
1130 }
1131 /* if (sent) XXX txsync ? */
1132 }
1133 return sent;
1134 }
1135
1136
1137 /*
1138 * netmap_txsync_to_host() passes packets up. We are called from a
1139 * system call in user process context, and the only contention
1140 * can be among multiple user threads erroneously calling
1141 * this routine concurrently.
1142 */
1143 static void
netmap_txsync_to_host(struct netmap_adapter * na)1144 netmap_txsync_to_host(struct netmap_adapter *na)
1145 {
1146 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1147 u_int const lim = kring->nkr_num_slots - 1;
1148 u_int const head = kring->rhead;
1149 struct mbq q;
1150
1151 /* Take packets from hwcur to head and pass them up.
1152 * force head = cur since netmap_grab_packets() stops at head
1153 * In case of no buffers we give up. At the end of the loop,
1154 * the queue is drained in all cases.
1155 */
1156 mbq_init(&q);
1157 netmap_grab_packets(kring, &q, 1 /* force */);
1158 ND("have %d pkts in queue", mbq_len(&q));
1159 kring->nr_hwcur = head;
1160 kring->nr_hwtail = head + lim;
1161 if (kring->nr_hwtail > lim)
1162 kring->nr_hwtail -= lim + 1;
1163
1164 netmap_send_up(na->ifp, &q);
1165 }
1166
1167
1168 /*
1169 * rxsync backend for packets coming from the host stack.
1170 * They have been put in kring->rx_queue by netmap_transmit().
1171 * We protect access to the kring using kring->rx_queue.lock
1172 *
1173 * This routine also does the selrecord if called from the poll handler
1174 * (we know because td != NULL).
1175 *
1176 * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1177 * as an additional hidden argument.
1178 * returns the number of packets delivered to tx queues in
1179 * transparent mode, or a negative value if error
1180 */
1181 static int
netmap_rxsync_from_host(struct netmap_adapter * na,struct thread * td,void * pwait)1182 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1183 {
1184 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1185 struct netmap_ring *ring = kring->ring;
1186 u_int nm_i, n;
1187 u_int const lim = kring->nkr_num_slots - 1;
1188 u_int const head = kring->rhead;
1189 int ret = 0;
1190 struct mbq *q = &kring->rx_queue, fq;
1191
1192 (void)pwait; /* disable unused warnings */
1193 (void)td;
1194
1195 mbq_init(&fq); /* fq holds packets to be freed */
1196
1197 mbq_lock(q);
1198
1199 /* First part: import newly received packets */
1200 n = mbq_len(q);
1201 if (n) { /* grab packets from the queue */
1202 struct mbuf *m;
1203 uint32_t stop_i;
1204
1205 nm_i = kring->nr_hwtail;
1206 stop_i = nm_prev(nm_i, lim);
1207 while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1208 int len = MBUF_LEN(m);
1209 struct netmap_slot *slot = &ring->slot[nm_i];
1210
1211 m_copydata(m, 0, len, NMB(na, slot));
1212 ND("nm %d len %d", nm_i, len);
1213 if (netmap_verbose)
1214 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1215
1216 slot->len = len;
1217 slot->flags = kring->nkr_slot_flags;
1218 nm_i = nm_next(nm_i, lim);
1219 mbq_enqueue(&fq, m);
1220 }
1221 kring->nr_hwtail = nm_i;
1222 }
1223
1224 /*
1225 * Second part: skip past packets that userspace has released.
1226 */
1227 nm_i = kring->nr_hwcur;
1228 if (nm_i != head) { /* something was released */
1229 if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1230 ret = netmap_sw_to_nic(na);
1231 kring->nr_hwcur = head;
1232 }
1233
1234 /* access copies of cur,tail in the kring */
1235 if (kring->rcur == kring->rtail && td) /* no bufs available */
1236 OS_selrecord(td, &kring->si);
1237
1238 mbq_unlock(q);
1239
1240 mbq_purge(&fq);
1241 mbq_destroy(&fq);
1242
1243 return ret;
1244 }
1245
1246
/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 * The error codes produced here are EOPNOTSUPP (no suitable adapter
 * type is available) or whatever generic_netmap_attach() returns.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * NOTE(review): no EBUSY path is visible in this function; confirm
 * where the bridge attachment is checked.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *					native_support
 * active_fds   dev.netmap.admode         YES     NO
 * -------------------------------------------------------
 *    >0              *                 NA(ifp) NA(ifp)
 *
 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
 *
 * No reference is taken here on the adapter returned in *na.
 */

int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	struct netmap_adapter *prev_na;
#ifdef WITH_GENERIC
	struct netmap_generic_adapter *gna;
	int error = 0;
#endif

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NETMAP_CAPABLE(ifp)) {
		prev_na = NA(ifp);
		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NETMAP_OWNED_BY_ANY(prev_na)
			|| i != NETMAP_ADMODE_GENERIC
			|| prev_na->na_flags & NAF_FORCE_NATIVE
#ifdef WITH_PIPES
			/* ugly, but we cannot allow an adapter switch
			 * if some pipe is referring to this one
			 */
			|| prev_na->na_next_pipe > 0
#endif
		) {
			*na = prev_na;
			return 0;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EOPNOTSUPP;

#ifdef WITH_GENERIC
	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	/* re-read NA(ifp): prev_na is not set above when !NETMAP_CAPABLE */
	prev_na = NA(ifp);
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	/* after a successful attach NA(ifp) is the new generic adapter */
	*na = NA(ifp);
	gna = (struct netmap_generic_adapter*)NA(ifp);
	gna->prev = prev_na; /* save old na */
	if (prev_na != NULL) {
		/* the value returned by ifunit_ref() is not used here */
		ifunit_ref(ifp->if_xname);
		// XXX add a refcount ?
		netmap_adapter_get(prev_na);
	}
	ND("Created generic NA %p (prev %p)", gna, gna->prev);

	return 0;
#else /* !WITH_GENERIC */
	return EOPNOTSUPP;
#endif
}
1351
1352
1353 /*
1354 * MUST BE CALLED UNDER NMG_LOCK()
1355 *
1356 * Get a refcounted reference to a netmap adapter attached
1357 * to the interface specified by nmr.
1358 * This is always called in the execution of an ioctl().
1359 *
1360 * Return ENXIO if the interface specified by the request does
1361 * not exist, ENOTSUP if netmap is not supported by the interface,
1362 * EBUSY if the interface is already attached to a bridge,
1363 * EINVAL if parameters are invalid, ENOMEM if needed resources
1364 * could not be allocated.
1365 * If successful, hold a reference to the netmap adapter.
1366 *
1367 * No reference is kept on the real interface, which may then
1368 * disappear at any time.
1369 */
1370 int
netmap_get_na(struct nmreq * nmr,struct netmap_adapter ** na,int create)1371 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1372 {
1373 struct ifnet *ifp = NULL;
1374 int error = 0;
1375 struct netmap_adapter *ret = NULL;
1376
1377 *na = NULL; /* default return value */
1378
1379 NMG_LOCK_ASSERT();
1380
1381 /* we cascade through all possibile types of netmap adapter.
1382 * All netmap_get_*_na() functions return an error and an na,
1383 * with the following combinations:
1384 *
1385 * error na
1386 * 0 NULL type doesn't match
1387 * !0 NULL type matches, but na creation/lookup failed
1388 * 0 !NULL type matches and na created/found
1389 * !0 !NULL impossible
1390 */
1391
1392 /* try to see if this is a monitor port */
1393 error = netmap_get_monitor_na(nmr, na, create);
1394 if (error || *na != NULL)
1395 return error;
1396
1397 /* try to see if this is a pipe port */
1398 error = netmap_get_pipe_na(nmr, na, create);
1399 if (error || *na != NULL)
1400 return error;
1401
1402 /* try to see if this is a bridge port */
1403 error = netmap_get_bdg_na(nmr, na, create);
1404 if (error)
1405 return error;
1406
1407 if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1408 goto out;
1409
1410 /*
1411 * This must be a hardware na, lookup the name in the system.
1412 * Note that by hardware we actually mean "it shows up in ifconfig".
1413 * This may still be a tap, a veth/epair, or even a
1414 * persistent VALE port.
1415 */
1416 ifp = ifunit_ref(nmr->nr_name);
1417 if (ifp == NULL) {
1418 return ENXIO;
1419 }
1420
1421 error = netmap_get_hw_na(ifp, &ret);
1422 if (error)
1423 goto out;
1424
1425 *na = ret;
1426 netmap_adapter_get(ret);
1427
1428 out:
1429 if (error && ret != NULL)
1430 netmap_adapter_put(ret);
1431
1432 if (ifp)
1433 if_rele(ifp); /* allow live unloading of drivers modules */
1434
1435 return error;
1436 }
1437
1438
/*
 * validate parameters on entry for *_txsync()
 * Returns ring->head if ok, or kring->nkr_num_slots
 * in case of error.
 *
 * On success the values read from the ring are saved in
 * kring->rhead and kring->rcur, and ring->tail is rewritten with
 * kring->rtail if userspace has changed it.
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 */
static u_int
nm_txsync_prologue(struct netmap_kring *kring)
{
/*
 * NM_ASSERT(t) fails the validation when 't' is TRUE: it logs the text
 * of the condition and jumps to the error exit.
 */
#define NM_ASSERT(t) if (t) { D("fail " #t); goto error; }
	struct netmap_ring *ring = kring->ring;
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	if (kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n ||  kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/*
	 * user sanity checks on the new head and cur, against the
	 * rhead and rtail saved in the kring.
	 * A, B, ... are possible positions for cur:
	 *
	 *  0    A  cur   B  tail  C  n-1
	 *  0    D  tail  E  cur   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		NM_ASSERT(head < kring->rhead || head > kring->rtail);
		/* and also head <= cur <= rtail */
		NM_ASSERT(cur < head || cur > kring->rtail);
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		NM_ASSERT(head > kring->rtail && head < kring->rhead);

		/* two cases now: head <= rtail or head >= rhead  */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			NM_ASSERT(cur < head || cur > kring->rtail);
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			NM_ASSERT(cur > kring->rtail && cur < head);
		}
	}
	/* tail belongs to the kernel: undo any change made by userspace */
	if (ring->tail != kring->rtail) {
		RD(5, "tail overwritten was %d need %d",
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	/* the new values are valid, remember them */
	kring->rhead = head;
	kring->rcur = cur;
	return head;

error:
	RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d",
		kring->name,
		head, cur, ring->tail,
		kring->rhead, kring->rcur, kring->rtail,
		kring->nr_hwcur, kring->nr_hwtail);
	/* n is not a valid slot index, so callers can tell it from a head */
	return n;
#undef NM_ASSERT
}
1515
1516
1517 /*
1518 * validate parameters on entry for *_rxsync()
1519 * Returns ring->head if ok, kring->nkr_num_slots on error.
1520 *
1521 * For a valid configuration,
1522 * hwcur <= head <= cur <= tail <= hwtail
1523 *
1524 * We only consider head and cur.
1525 * hwcur and hwtail are reliable.
1526 *
1527 */
1528 static u_int
nm_rxsync_prologue(struct netmap_kring * kring)1529 nm_rxsync_prologue(struct netmap_kring *kring)
1530 {
1531 struct netmap_ring *ring = kring->ring;
1532 uint32_t const n = kring->nkr_num_slots;
1533 uint32_t head, cur;
1534
1535 ND(5,"%s kc %d kt %d h %d c %d t %d",
1536 kring->name,
1537 kring->nr_hwcur, kring->nr_hwtail,
1538 ring->head, ring->cur, ring->tail);
1539 /*
1540 * Before storing the new values, we should check they do not
1541 * move backwards. However:
1542 * - head is not an issue because the previous value is hwcur;
1543 * - cur could in principle go back, however it does not matter
1544 * because we are processing a brand new rxsync()
1545 */
1546 cur = kring->rcur = ring->cur; /* read only once */
1547 head = kring->rhead = ring->head; /* read only once */
1548 #if 1 /* kernel sanity checks */
1549 if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1550 goto error;
1551 #endif /* kernel sanity checks */
1552 /* user sanity checks */
1553 if (kring->nr_hwtail >= kring->nr_hwcur) {
1554 /* want hwcur <= rhead <= hwtail */
1555 if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1556 goto error;
1557 /* and also rhead <= rcur <= hwtail */
1558 if (cur < head || cur > kring->nr_hwtail)
1559 goto error;
1560 } else {
1561 /* we need rhead outside hwtail..hwcur */
1562 if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1563 goto error;
1564 /* two cases now: head <= hwtail or head >= hwcur */
1565 if (head <= kring->nr_hwtail) {
1566 /* want head <= cur <= hwtail */
1567 if (cur < head || cur > kring->nr_hwtail)
1568 goto error;
1569 } else {
1570 /* cur must be outside hwtail..head */
1571 if (cur < head && cur > kring->nr_hwtail)
1572 goto error;
1573 }
1574 }
1575 if (ring->tail != kring->rtail) {
1576 RD(5, "%s tail overwritten was %d need %d",
1577 kring->name,
1578 ring->tail, kring->rtail);
1579 ring->tail = kring->rtail;
1580 }
1581 return head;
1582
1583 error:
1584 RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1585 kring->nr_hwcur,
1586 kring->rcur, kring->nr_hwtail,
1587 kring->rhead, kring->rcur, ring->tail);
1588 return n;
1589 }
1590
1591
1592 /*
1593 * Error routine called when txsync/rxsync detects an error.
1594 * Can't do much more than resetting head =cur = hwcur, tail = hwtail
1595 * Return 1 on reinit.
1596 *
1597 * This routine is only called by the upper half of the kernel.
1598 * It only reads hwcur (which is changed only by the upper half, too)
1599 * and hwtail (which may be changed by the lower half, but only on
1600 * a tx ring and only to increase it, so any error will be recovered
1601 * on the next call). For the above, we don't strictly need to call
1602 * it under lock.
1603 */
1604 int
netmap_ring_reinit(struct netmap_kring * kring)1605 netmap_ring_reinit(struct netmap_kring *kring)
1606 {
1607 struct netmap_ring *ring = kring->ring;
1608 u_int i, lim = kring->nkr_num_slots - 1;
1609 int errors = 0;
1610
1611 // XXX KASSERT nm_kr_tryget
1612 RD(10, "called for %s", kring->name);
1613 // XXX probably wrong to trust userspace
1614 kring->rhead = ring->head;
1615 kring->rcur = ring->cur;
1616 kring->rtail = ring->tail;
1617
1618 if (ring->cur > lim)
1619 errors++;
1620 if (ring->head > lim)
1621 errors++;
1622 if (ring->tail > lim)
1623 errors++;
1624 for (i = 0; i <= lim; i++) {
1625 u_int idx = ring->slot[i].buf_idx;
1626 u_int len = ring->slot[i].len;
1627 if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1628 RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1629 ring->slot[i].buf_idx = 0;
1630 ring->slot[i].len = 0;
1631 } else if (len > NETMAP_BUF_SIZE(kring->na)) {
1632 ring->slot[i].len = 0;
1633 RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1634 }
1635 }
1636 if (errors) {
1637 RD(10, "total %d errors", errors);
1638 RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1639 kring->name,
1640 ring->cur, kring->nr_hwcur,
1641 ring->tail, kring->nr_hwtail);
1642 ring->head = kring->rhead = kring->nr_hwcur;
1643 ring->cur = kring->rcur = kring->nr_hwcur;
1644 ring->tail = kring->rtail = kring->nr_hwtail;
1645 }
1646 return (errors ? 1 : 0);
1647 }
1648
1649 /* interpret the ringid and flags fields of an nmreq, by translating them
1650 * into a pair of intervals of ring indices:
1651 *
1652 * [priv->np_txqfirst, priv->np_txqlast) and
1653 * [priv->np_rxqfirst, priv->np_rxqlast)
1654 *
1655 */
1656 int
netmap_interp_ringid(struct netmap_priv_d * priv,uint16_t ringid,uint32_t flags)1657 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1658 {
1659 struct netmap_adapter *na = priv->np_na;
1660 u_int j, i = ringid & NETMAP_RING_MASK;
1661 u_int reg = flags & NR_REG_MASK;
1662 enum txrx t;
1663
1664 if (reg == NR_REG_DEFAULT) {
1665 /* convert from old ringid to flags */
1666 if (ringid & NETMAP_SW_RING) {
1667 reg = NR_REG_SW;
1668 } else if (ringid & NETMAP_HW_RING) {
1669 reg = NR_REG_ONE_NIC;
1670 } else {
1671 reg = NR_REG_ALL_NIC;
1672 }
1673 D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1674 }
1675 switch (reg) {
1676 case NR_REG_ALL_NIC:
1677 case NR_REG_PIPE_MASTER:
1678 case NR_REG_PIPE_SLAVE:
1679 for_rx_tx(t) {
1680 priv->np_qfirst[t] = 0;
1681 priv->np_qlast[t] = nma_get_nrings(na, t);
1682 }
1683 ND("%s %d %d", "ALL/PIPE",
1684 priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
1685 break;
1686 case NR_REG_SW:
1687 case NR_REG_NIC_SW:
1688 if (!(na->na_flags & NAF_HOST_RINGS)) {
1689 D("host rings not supported");
1690 return EINVAL;
1691 }
1692 for_rx_tx(t) {
1693 priv->np_qfirst[t] = (reg == NR_REG_SW ?
1694 nma_get_nrings(na, t) : 0);
1695 priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
1696 }
1697 ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1698 priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
1699 break;
1700 case NR_REG_ONE_NIC:
1701 if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1702 D("invalid ring id %d", i);
1703 return EINVAL;
1704 }
1705 for_rx_tx(t) {
1706 /* if not enough rings, use the first one */
1707 j = i;
1708 if (j >= nma_get_nrings(na, t))
1709 j = 0;
1710 priv->np_qfirst[t] = j;
1711 priv->np_qlast[t] = j + 1;
1712 }
1713 break;
1714 default:
1715 D("invalid regif type %d", reg);
1716 return EINVAL;
1717 }
1718 priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1719
1720 if (netmap_verbose) {
1721 D("%s: tx [%d,%d) rx [%d,%d) id %d",
1722 na->name,
1723 priv->np_qfirst[NR_TX],
1724 priv->np_qlast[NR_TX],
1725 priv->np_qfirst[NR_RX],
1726 priv->np_qlast[NR_RX],
1727 i);
1728 }
1729 return 0;
1730 }
1731
1732
1733 /*
1734 * Set the ring ID. For devices with a single queue, a request
1735 * for all rings is the same as a single ring.
1736 */
1737 static int
netmap_set_ringid(struct netmap_priv_d * priv,uint16_t ringid,uint32_t flags)1738 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1739 {
1740 struct netmap_adapter *na = priv->np_na;
1741 int error;
1742 enum txrx t;
1743
1744 error = netmap_interp_ringid(priv, ringid, flags);
1745 if (error) {
1746 return error;
1747 }
1748
1749 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1750
1751 /* optimization: count the users registered for more than
1752 * one ring, which are the ones sleeping on the global queue.
1753 * The default netmap_notify() callback will then
1754 * avoid signaling the global queue if nobody is using it
1755 */
1756 for_rx_tx(t) {
1757 if (nm_si_user(priv, t))
1758 na->si_users[t]++;
1759 }
1760 return 0;
1761 }
1762
1763 static void
netmap_unset_ringid(struct netmap_priv_d * priv)1764 netmap_unset_ringid(struct netmap_priv_d *priv)
1765 {
1766 struct netmap_adapter *na = priv->np_na;
1767 enum txrx t;
1768
1769 for_rx_tx(t) {
1770 if (nm_si_user(priv, t))
1771 na->si_users[t]--;
1772 priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1773 }
1774 priv->np_flags = 0;
1775 priv->np_txpoll = 0;
1776 }
1777
1778
1779 /* check that the rings we want to bind are not exclusively owned by a previous
1780 * bind. If exclusive ownership has been requested, we also mark the rings.
1781 */
1782 static int
netmap_get_exclusive(struct netmap_priv_d * priv)1783 netmap_get_exclusive(struct netmap_priv_d *priv)
1784 {
1785 struct netmap_adapter *na = priv->np_na;
1786 u_int i;
1787 struct netmap_kring *kring;
1788 int excl = (priv->np_flags & NR_EXCLUSIVE);
1789 enum txrx t;
1790
1791 ND("%s: grabbing tx [%d, %d) rx [%d, %d)",
1792 na->name,
1793 priv->np_qfirst[NR_TX],
1794 priv->np_qlast[NR_TX],
1795 priv->np_qfirst[NR_RX],
1796 priv->np_qlast[NR_RX]);
1797
1798 /* first round: check that all the requested rings
1799 * are neither alread exclusively owned, nor we
1800 * want exclusive ownership when they are already in use
1801 */
1802 for_rx_tx(t) {
1803 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1804 kring = &NMR(na, t)[i];
1805 if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1806 (kring->users && excl))
1807 {
1808 ND("ring %s busy", kring->name);
1809 return EBUSY;
1810 }
1811 }
1812 }
1813
1814 /* second round: increment usage cound and possibly
1815 * mark as exclusive
1816 */
1817
1818 for_rx_tx(t) {
1819 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1820 kring = &NMR(na, t)[i];
1821 kring->users++;
1822 if (excl)
1823 kring->nr_kflags |= NKR_EXCLUSIVE;
1824 }
1825 }
1826
1827 return 0;
1828
1829 }
1830
1831 /* undo netmap_get_ownership() */
1832 static void
netmap_rel_exclusive(struct netmap_priv_d * priv)1833 netmap_rel_exclusive(struct netmap_priv_d *priv)
1834 {
1835 struct netmap_adapter *na = priv->np_na;
1836 u_int i;
1837 struct netmap_kring *kring;
1838 int excl = (priv->np_flags & NR_EXCLUSIVE);
1839 enum txrx t;
1840
1841 ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1842 na->name,
1843 priv->np_qfirst[NR_TX],
1844 priv->np_qlast[NR_TX],
1845 priv->np_qfirst[NR_RX],
1846 priv->np_qlast[MR_RX]);
1847
1848
1849 for_rx_tx(t) {
1850 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1851 kring = &NMR(na, t)[i];
1852 if (excl)
1853 kring->nr_kflags &= ~NKR_EXCLUSIVE;
1854 kring->users--;
1855 }
1856 }
1857 }
1858
1859 /*
1860 * possibly move the interface to netmap-mode.
1861 * If success it returns a pointer to netmap_if, otherwise NULL.
1862 * This must be called with NMG_LOCK held.
1863 *
1864 * The following na callbacks are called in the process:
1865 *
1866 * na->nm_config() [by netmap_update_config]
1867 * (get current number and size of rings)
1868 *
1869 * We have a generic one for linux (netmap_linux_config).
1870 * The bwrap has to override this, since it has to forward
1871 * the request to the wrapped adapter (netmap_bwrap_config).
1872 *
1873 *
1874 * na->nm_krings_create()
1875 * (create and init the krings array)
1876 *
1877 * One of the following:
1878 *
1879 * * netmap_hw_krings_create, (hw ports)
1880 * creates the standard layout for the krings
1881 * and adds the mbq (used for the host rings).
1882 *
1883 * * netmap_vp_krings_create (VALE ports)
1884 * add leases and scratchpads
1885 *
1886 * * netmap_pipe_krings_create (pipes)
1887 * create the krings and rings of both ends and
1888 * cross-link them
1889 *
1890 * * netmap_monitor_krings_create (monitors)
1891 * avoid allocating the mbq
1892 *
1893 * * netmap_bwrap_krings_create (bwraps)
 * 		create both the bwrap krings array,
1895 * the krings array of the wrapped adapter, and
1896 * (if needed) the fake array for the host adapter
1897 *
1898 * na->nm_register(, 1)
1899 * (put the adapter in netmap mode)
1900 *
1901 * This may be one of the following:
1902 * (XXX these should be either all *_register or all *_reg 2014-03-15)
1903 *
1904 * * netmap_hw_register (hw ports)
1905 * checks that the ifp is still there, then calls
1906 * the hardware specific callback;
1907 *
1908 * * netmap_vp_reg (VALE ports)
1909 * If the port is connected to a bridge,
1910 * set the NAF_NETMAP_ON flag under the
1911 * bridge write lock.
1912 *
1913 * * netmap_pipe_reg (pipes)
1914 * inform the other pipe end that it is no
 * 		longer responsible for the lifetime of this
1916 * pipe end
1917 *
1918 * * netmap_monitor_reg (monitors)
1919 * intercept the sync callbacks of the monitored
1920 * rings
1921 *
1922 * * netmap_bwrap_register (bwraps)
1923 * cross-link the bwrap and hwna rings,
1924 * forward the request to the hwna, override
1925 * the hwna notify callback (to get the frames
1926 * coming from outside go through the bridge).
1927 *
1928 *
1929 */
1930 int
netmap_do_regif(struct netmap_priv_d * priv,struct netmap_adapter * na,uint16_t ringid,uint32_t flags)1931 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1932 uint16_t ringid, uint32_t flags)
1933 {
1934 struct netmap_if *nifp = NULL;
1935 int error;
1936
1937 NMG_LOCK_ASSERT();
1938 /* ring configuration may have changed, fetch from the card */
1939 netmap_update_config(na);
1940 priv->np_na = na; /* store the reference */
1941 error = netmap_set_ringid(priv, ringid, flags);
1942 if (error)
1943 goto err;
1944 error = netmap_mem_finalize(na->nm_mem, na);
1945 if (error)
1946 goto err;
1947
1948 if (na->active_fds == 0) {
1949 /*
1950 * If this is the first registration of the adapter,
1951 * also create the netmap rings and their in-kernel view,
1952 * the netmap krings.
1953 */
1954
1955 /*
1956 * Depending on the adapter, this may also create
1957 * the netmap rings themselves
1958 */
1959 error = na->nm_krings_create(na);
1960 if (error)
1961 goto err_drop_mem;
1962
1963 /* create all missing netmap rings */
1964 error = netmap_mem_rings_create(na);
1965 if (error)
1966 goto err_del_krings;
1967 }
1968
1969 /* now the kring must exist and we can check whether some
1970 * previous bind has exclusive ownership on them
1971 */
1972 error = netmap_get_exclusive(priv);
1973 if (error)
1974 goto err_del_rings;
1975
1976 /* in all cases, create a new netmap if */
1977 nifp = netmap_mem_if_new(na);
1978 if (nifp == NULL) {
1979 error = ENOMEM;
1980 goto err_rel_excl;
1981 }
1982
1983 na->active_fds++;
1984 if (!nm_netmap_on(na)) {
1985 /* Netmap not active, set the card in netmap mode
1986 * and make it use the shared buffers.
1987 */
1988 /* cache the allocator info in the na */
1989 netmap_mem_get_lut(na->nm_mem, &na->na_lut);
1990 ND("%p->na_lut == %p", na, na->na_lut.lut);
1991 error = na->nm_register(na, 1); /* mode on */
1992 if (error)
1993 goto err_del_if;
1994 }
1995
1996 /*
1997 * advertise that the interface is ready by setting np_nifp.
1998 * The barrier is needed because readers (poll, *SYNC and mmap)
1999 * check for priv->np_nifp != NULL without locking
2000 */
2001 mb(); /* make sure previous writes are visible to all CPUs */
2002 priv->np_nifp = nifp;
2003
2004 return 0;
2005
2006 err_del_if:
2007 memset(&na->na_lut, 0, sizeof(na->na_lut));
2008 na->active_fds--;
2009 netmap_mem_if_delete(na, nifp);
2010 err_rel_excl:
2011 netmap_rel_exclusive(priv);
2012 err_del_rings:
2013 if (na->active_fds == 0)
2014 netmap_mem_rings_delete(na);
2015 err_del_krings:
2016 if (na->active_fds == 0)
2017 na->nm_krings_delete(na);
2018 err_drop_mem:
2019 netmap_mem_deref(na->nm_mem, na);
2020 err:
2021 priv->np_na = NULL;
2022 return error;
2023 }
2024
2025
2026 /*
2027 * update kring and ring at the end of txsync.
2028 */
2029 static inline void
nm_txsync_finalize(struct netmap_kring * kring)2030 nm_txsync_finalize(struct netmap_kring *kring)
2031 {
2032 /* update ring tail to what the kernel knows */
2033 kring->ring->tail = kring->rtail = kring->nr_hwtail;
2034
2035 /* note, head/rhead/hwcur might be behind cur/rcur
2036 * if no carrier
2037 */
2038 ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2039 kring->name, kring->nr_hwcur, kring->nr_hwtail,
2040 kring->rhead, kring->rcur, kring->rtail);
2041 }
2042
2043
2044 /*
2045 * update kring and ring at the end of rxsync
2046 */
2047 static inline void
nm_rxsync_finalize(struct netmap_kring * kring)2048 nm_rxsync_finalize(struct netmap_kring *kring)
2049 {
2050 /* tell userspace that there might be new packets */
2051 //struct netmap_ring *ring = kring->ring;
2052 ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail,
2053 kring->nr_hwtail);
2054 kring->ring->tail = kring->rtail = kring->nr_hwtail;
2055 /* make a copy of the state for next round */
2056 kring->rhead = kring->ring->head;
2057 kring->rcur = kring->ring->cur;
2058 }
2059
2060
2061
2062 /*
2063 * ioctl(2) support for the "netmap" device.
2064 *
2065 * Following a list of accepted commands:
2066 * - NIOCGINFO
2067 * - SIOCGIFADDR just for convenience
2068 * - NIOCREGIF
2069 * - NIOCTXSYNC
2070 * - NIOCRXSYNC
2071 *
2072 * Return 0 on success, errno otherwise.
2073 */
2074 int
netmap_ioctl(struct cdev * dev,u_long cmd,caddr_t data,int fflag,struct thread * td)2075 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2076 int fflag, struct thread *td)
2077 {
2078 struct netmap_priv_d *priv = NULL;
2079 struct nmreq *nmr = (struct nmreq *) data;
2080 struct netmap_adapter *na = NULL;
2081 int error;
2082 u_int i, qfirst, qlast;
2083 struct netmap_if *nifp;
2084 struct netmap_kring *krings;
2085 enum txrx t;
2086
2087 (void)dev; /* UNUSED */
2088 (void)fflag; /* UNUSED */
2089
2090 if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2091 /* truncate name */
2092 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2093 if (nmr->nr_version != NETMAP_API) {
2094 D("API mismatch for %s got %d need %d",
2095 nmr->nr_name,
2096 nmr->nr_version, NETMAP_API);
2097 nmr->nr_version = NETMAP_API;
2098 }
2099 if (nmr->nr_version < NETMAP_MIN_API ||
2100 nmr->nr_version > NETMAP_MAX_API) {
2101 return EINVAL;
2102 }
2103 }
2104 CURVNET_SET(TD_TO_VNET(td));
2105
2106 error = devfs_get_cdevpriv((void **)&priv);
2107 if (error) {
2108 CURVNET_RESTORE();
2109 /* XXX ENOENT should be impossible, since the priv
2110 * is now created in the open */
2111 return (error == ENOENT ? ENXIO : error);
2112 }
2113
2114 switch (cmd) {
2115 case NIOCGINFO: /* return capabilities etc */
2116 if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2117 error = netmap_bdg_ctl(nmr, NULL);
2118 break;
2119 }
2120
2121 NMG_LOCK();
2122 do {
2123 /* memsize is always valid */
2124 struct netmap_mem_d *nmd = &nm_mem;
2125 u_int memflags;
2126
2127 if (nmr->nr_name[0] != '\0') {
2128 /* get a refcount */
2129 error = netmap_get_na(nmr, &na, 1 /* create */);
2130 if (error)
2131 break;
2132 nmd = na->nm_mem; /* get memory allocator */
2133 }
2134
2135 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2136 &nmr->nr_arg2);
2137 if (error)
2138 break;
2139 if (na == NULL) /* only memory info */
2140 break;
2141 nmr->nr_offset = 0;
2142 nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2143 netmap_update_config(na);
2144 nmr->nr_rx_rings = na->num_rx_rings;
2145 nmr->nr_tx_rings = na->num_tx_rings;
2146 nmr->nr_rx_slots = na->num_rx_desc;
2147 nmr->nr_tx_slots = na->num_tx_desc;
2148 netmap_adapter_put(na);
2149 } while (0);
2150 NMG_UNLOCK();
2151 break;
2152
2153 case NIOCREGIF:
2154 /* possibly attach/detach NIC and VALE switch */
2155 i = nmr->nr_cmd;
2156 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2157 || i == NETMAP_BDG_VNET_HDR
2158 || i == NETMAP_BDG_NEWIF
2159 || i == NETMAP_BDG_DELIF) {
2160 error = netmap_bdg_ctl(nmr, NULL);
2161 break;
2162 } else if (i != 0) {
2163 D("nr_cmd must be 0 not %d", i);
2164 error = EINVAL;
2165 break;
2166 }
2167
2168 /* protect access to priv from concurrent NIOCREGIF */
2169 NMG_LOCK();
2170 do {
2171 u_int memflags;
2172
2173 if (priv->np_nifp != NULL) { /* thread already registered */
2174 error = EBUSY;
2175 break;
2176 }
2177 /* find the interface and a reference */
2178 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2179 if (error)
2180 break;
2181 if (NETMAP_OWNED_BY_KERN(na)) {
2182 netmap_adapter_put(na);
2183 error = EBUSY;
2184 break;
2185 }
2186 error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
2187 if (error) { /* reg. failed, release priv and ref */
2188 netmap_adapter_put(na);
2189 break;
2190 }
2191 nifp = priv->np_nifp;
2192 priv->np_td = td; // XXX kqueue, debugging only
2193
2194 /* return the offset of the netmap_if object */
2195 nmr->nr_rx_rings = na->num_rx_rings;
2196 nmr->nr_tx_rings = na->num_tx_rings;
2197 nmr->nr_rx_slots = na->num_rx_desc;
2198 nmr->nr_tx_slots = na->num_tx_desc;
2199 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2200 &nmr->nr_arg2);
2201 if (error) {
2202 netmap_do_unregif(priv);
2203 netmap_adapter_put(na);
2204 break;
2205 }
2206 if (memflags & NETMAP_MEM_PRIVATE) {
2207 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2208 }
2209 for_rx_tx(t) {
2210 priv->np_si[t] = nm_si_user(priv, t) ?
2211 &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si;
2212 }
2213
2214 if (nmr->nr_arg3) {
2215 D("requested %d extra buffers", nmr->nr_arg3);
2216 nmr->nr_arg3 = netmap_extra_alloc(na,
2217 &nifp->ni_bufs_head, nmr->nr_arg3);
2218 D("got %d extra buffers", nmr->nr_arg3);
2219 }
2220 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2221 } while (0);
2222 NMG_UNLOCK();
2223 break;
2224
2225 case NIOCTXSYNC:
2226 case NIOCRXSYNC:
2227 nifp = priv->np_nifp;
2228
2229 if (nifp == NULL) {
2230 error = ENXIO;
2231 break;
2232 }
2233 mb(); /* make sure following reads are not from cache */
2234
2235 na = priv->np_na; /* we have a reference */
2236
2237 if (na == NULL) {
2238 D("Internal error: nifp != NULL && na == NULL");
2239 error = ENXIO;
2240 break;
2241 }
2242
2243 if (!nm_netmap_on(na)) {
2244 error = ENXIO;
2245 break;
2246 }
2247
2248 t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2249 krings = NMR(na, t);
2250 qfirst = priv->np_qfirst[t];
2251 qlast = priv->np_qlast[t];
2252
2253 for (i = qfirst; i < qlast; i++) {
2254 struct netmap_kring *kring = krings + i;
2255 if (nm_kr_tryget(kring)) {
2256 error = EBUSY;
2257 goto out;
2258 }
2259 if (cmd == NIOCTXSYNC) {
2260 if (netmap_verbose & NM_VERB_TXSYNC)
2261 D("pre txsync ring %d cur %d hwcur %d",
2262 i, kring->ring->cur,
2263 kring->nr_hwcur);
2264 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2265 netmap_ring_reinit(kring);
2266 } else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) {
2267 nm_txsync_finalize(kring);
2268 }
2269 if (netmap_verbose & NM_VERB_TXSYNC)
2270 D("post txsync ring %d cur %d hwcur %d",
2271 i, kring->ring->cur,
2272 kring->nr_hwcur);
2273 } else {
2274 if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2275 netmap_ring_reinit(kring);
2276 } else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) {
2277 nm_rxsync_finalize(kring);
2278 }
2279 microtime(&na->rx_rings[i].ring->ts);
2280 }
2281 nm_kr_put(kring);
2282 }
2283
2284 break;
2285
2286 #ifdef WITH_VALE
2287 case NIOCCONFIG:
2288 error = netmap_bdg_config(nmr);
2289 break;
2290 #endif
2291 #ifdef __FreeBSD__
2292 case FIONBIO:
2293 case FIOASYNC:
2294 ND("FIONBIO/FIOASYNC are no-ops");
2295 break;
2296
2297 case BIOCIMMEDIATE:
2298 case BIOCGHDRCMPLT:
2299 case BIOCSHDRCMPLT:
2300 case BIOCSSEESENT:
2301 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2302 break;
2303
2304 default: /* allow device-specific ioctls */
2305 {
2306 struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2307 if (ifp == NULL) {
2308 error = ENXIO;
2309 } else {
2310 struct socket so;
2311
2312 bzero(&so, sizeof(so));
2313 so.so_vnet = ifp->if_vnet;
2314 // so->so_proto not null.
2315 error = ifioctl(&so, cmd, data, td);
2316 if_rele(ifp);
2317 }
2318 break;
2319 }
2320
2321 #else /* linux */
2322 default:
2323 error = EOPNOTSUPP;
2324 #endif /* linux */
2325 }
2326 out:
2327
2328 CURVNET_RESTORE();
2329 return (error);
2330 }
2331
2332
2333 /*
2334 * select(2) and poll(2) handlers for the "netmap" device.
2335 *
2336 * Can be called for one or more queues.
2337 * Return true the event mask corresponding to ready events.
2338 * If there are no ready events, do a selrecord on either individual
2339 * selinfo or on the global one.
2340 * Device-dependent parts (locking and sync of tx/rx rings)
2341 * are done through callbacks.
2342 *
2343 * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
2344 * The first one is remapped to pwait as selrecord() uses the name as an
2345 * hidden argument.
2346 */
2347 int
netmap_poll(struct cdev * dev,int events,struct thread * td)2348 netmap_poll(struct cdev *dev, int events, struct thread *td)
2349 {
2350 struct netmap_priv_d *priv = NULL;
2351 struct netmap_adapter *na;
2352 struct netmap_kring *kring;
2353 u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
2354 #define want_tx want[NR_TX]
2355 #define want_rx want[NR_RX]
2356 struct mbq q; /* packets from hw queues to host stack */
2357 void *pwait = dev; /* linux compatibility */
2358 int is_kevent = 0;
2359 enum txrx t;
2360
2361 /*
2362 * In order to avoid nested locks, we need to "double check"
2363 * txsync and rxsync if we decide to do a selrecord().
2364 * retry_tx (and retry_rx, later) prevent looping forever.
2365 */
2366 int retry_tx = 1, retry_rx = 1;
2367
2368 (void)pwait;
2369 mbq_init(&q);
2370
2371 /*
2372 * XXX kevent has curthread->tp_fop == NULL,
2373 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2374 * priv as the first argument, which is also useful to avoid
2375 * the selrecord() which are not necessary in that case.
2376 */
2377 if (devfs_get_cdevpriv((void **)&priv) != 0) {
2378 is_kevent = 1;
2379 if (netmap_verbose)
2380 D("called from kevent");
2381 priv = (struct netmap_priv_d *)dev;
2382 }
2383 if (priv == NULL)
2384 return POLLERR;
2385
2386 if (priv->np_nifp == NULL) {
2387 D("No if registered");
2388 return POLLERR;
2389 }
2390 mb(); /* make sure following reads are not from cache */
2391
2392 na = priv->np_na;
2393
2394 if (!nm_netmap_on(na))
2395 return POLLERR;
2396
2397 if (netmap_verbose & 0x8000)
2398 D("device %s events 0x%x", na->name, events);
2399 want_tx = events & (POLLOUT | POLLWRNORM);
2400 want_rx = events & (POLLIN | POLLRDNORM);
2401
2402
2403 /*
2404 * check_all_{tx|rx} are set if the card has more than one queue AND
2405 * the file descriptor is bound to all of them. If so, we sleep on
2406 * the "global" selinfo, otherwise we sleep on individual selinfo
2407 * (FreeBSD only allows two selinfo's per file descriptor).
2408 * The interrupt routine in the driver wake one or the other
2409 * (or both) depending on which clients are active.
2410 *
2411 * rxsync() is only called if we run out of buffers on a POLLIN.
2412 * txsync() is called if we run out of buffers on POLLOUT, or
2413 * there are pending packets to send. The latter can be disabled
2414 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
2415 */
2416 check_all_tx = nm_si_user(priv, NR_TX);
2417 check_all_rx = nm_si_user(priv, NR_RX);
2418
2419 /*
2420 * We start with a lock free round which is cheap if we have
2421 * slots available. If this fails, then lock and call the sync
2422 * routines.
2423 */
2424 for_rx_tx(t) {
2425 for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2426 kring = &NMR(na, t)[i];
2427 /* XXX compare ring->cur and kring->tail */
2428 if (!nm_ring_empty(kring->ring)) {
2429 revents |= want[t];
2430 want[t] = 0; /* also breaks the loop */
2431 }
2432 }
2433 }
2434
2435 /*
2436 * If we want to push packets out (priv->np_txpoll) or
2437 * want_tx is still set, we must issue txsync calls
2438 * (on all rings, to avoid that the tx rings stall).
2439 * XXX should also check cur != hwcur on the tx rings.
2440 * Fortunately, normal tx mode has np_txpoll set.
2441 */
2442 if (priv->np_txpoll || want_tx) {
2443 /*
2444 * The first round checks if anyone is ready, if not
2445 * do a selrecord and another round to handle races.
2446 * want_tx goes to 0 if any space is found, and is
2447 * used to skip rings with no pending transmissions.
2448 */
2449 flush_tx:
2450 for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_RX]; i++) {
2451 int found = 0;
2452
2453 kring = &na->tx_rings[i];
2454 if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2455 continue;
2456 /* only one thread does txsync */
2457 if (nm_kr_tryget(kring)) {
2458 /* either busy or stopped
2459 * XXX if the ring is stopped, sleeping would
2460 * be better. In current code, however, we only
2461 * stop the rings for brief intervals (2014-03-14)
2462 */
2463 if (netmap_verbose)
2464 RD(2, "%p lost race on txring %d, ok",
2465 priv, i);
2466 continue;
2467 }
2468 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2469 netmap_ring_reinit(kring);
2470 revents |= POLLERR;
2471 } else {
2472 if (kring->nm_sync(kring, 0))
2473 revents |= POLLERR;
2474 else
2475 nm_txsync_finalize(kring);
2476 }
2477
2478 /*
2479 * If we found new slots, notify potential
2480 * listeners on the same ring.
2481 * Since we just did a txsync, look at the copies
2482 * of cur,tail in the kring.
2483 */
2484 found = kring->rcur != kring->rtail;
2485 nm_kr_put(kring);
2486 if (found) { /* notify other listeners */
2487 revents |= want_tx;
2488 want_tx = 0;
2489 kring->nm_notify(kring, 0);
2490 }
2491 }
2492 if (want_tx && retry_tx && !is_kevent) {
2493 OS_selrecord(td, check_all_tx ?
2494 &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2495 retry_tx = 0;
2496 goto flush_tx;
2497 }
2498 }
2499
2500 /*
2501 * If want_rx is still set scan receive rings.
2502 * Do it on all rings because otherwise we starve.
2503 */
2504 if (want_rx) {
2505 int send_down = 0; /* transparent mode */
2506 /* two rounds here for race avoidance */
2507 do_retry_rx:
2508 for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2509 int found = 0;
2510
2511 kring = &na->rx_rings[i];
2512
2513 if (nm_kr_tryget(kring)) {
2514 if (netmap_verbose)
2515 RD(2, "%p lost race on rxring %d, ok",
2516 priv, i);
2517 continue;
2518 }
2519
2520 if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2521 netmap_ring_reinit(kring);
2522 revents |= POLLERR;
2523 }
2524 /* now we can use kring->rcur, rtail */
2525
2526 /*
2527 * transparent mode support: collect packets
2528 * from the rxring(s).
2529 * XXX NR_FORWARD should only be read on
2530 * physical or NIC ports
2531 */
2532 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
2533 ND(10, "forwarding some buffers up %d to %d",
2534 kring->nr_hwcur, kring->ring->cur);
2535 netmap_grab_packets(kring, &q, netmap_fwd);
2536 }
2537
2538 if (kring->nm_sync(kring, 0))
2539 revents |= POLLERR;
2540 else
2541 nm_rxsync_finalize(kring);
2542 if (netmap_no_timestamp == 0 ||
2543 kring->ring->flags & NR_TIMESTAMP) {
2544 microtime(&kring->ring->ts);
2545 }
2546 found = kring->rcur != kring->rtail;
2547 nm_kr_put(kring);
2548 if (found) {
2549 revents |= want_rx;
2550 retry_rx = 0;
2551 kring->nm_notify(kring, 0);
2552 }
2553 }
2554
2555 /* transparent mode XXX only during first pass ? */
2556 if (na->na_flags & NAF_HOST_RINGS) {
2557 kring = &na->rx_rings[na->num_rx_rings];
2558 if (check_all_rx
2559 && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2560 /* XXX fix to use kring fields */
2561 if (nm_ring_empty(kring->ring))
2562 send_down = netmap_rxsync_from_host(na, td, dev);
2563 if (!nm_ring_empty(kring->ring))
2564 revents |= want_rx;
2565 }
2566 }
2567
2568 if (retry_rx && !is_kevent)
2569 OS_selrecord(td, check_all_rx ?
2570 &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2571 if (send_down > 0 || retry_rx) {
2572 retry_rx = 0;
2573 if (send_down)
2574 goto flush_tx; /* and retry_rx */
2575 else
2576 goto do_retry_rx;
2577 }
2578 }
2579
2580 /*
2581 * Transparent mode: marked bufs on rx rings between
2582 * kring->nr_hwcur and ring->head
2583 * are passed to the other endpoint.
2584 *
2585 * In this mode we also scan the sw rxring, which in
2586 * turn passes packets up.
2587 *
2588 * XXX Transparent mode at the moment requires to bind all
2589 * rings to a single file descriptor.
2590 */
2591
2592 if (q.head && na->ifp != NULL)
2593 netmap_send_up(na->ifp, &q);
2594
2595 return (revents);
2596 #undef want_tx
2597 #undef want_rx
2598 }
2599
2600
2601 /*-------------------- driver support routines -------------------*/
2602
2603 static int netmap_hw_krings_create(struct netmap_adapter *);
2604
2605 /* default notify callback */
2606 static int
netmap_notify(struct netmap_kring * kring,int flags)2607 netmap_notify(struct netmap_kring *kring, int flags)
2608 {
2609 struct netmap_adapter *na = kring->na;
2610 enum txrx t = kring->tx;
2611
2612 OS_selwakeup(&kring->si, PI_NET);
2613 /* optimization: avoid a wake up on the global
2614 * queue if nobody has registered for more
2615 * than one ring
2616 */
2617 if (na->si_users[t] > 0)
2618 OS_selwakeup(&na->si[t], PI_NET);
2619
2620 return 0;
2621 }
2622
2623
2624 /* called by all routines that create netmap_adapters.
2625 * Attach na to the ifp (if any) and provide defaults
2626 * for optional callbacks. Defaults assume that we
2627 * are creating an hardware netmap_adapter.
2628 */
2629 int
netmap_attach_common(struct netmap_adapter * na)2630 netmap_attach_common(struct netmap_adapter *na)
2631 {
2632 struct ifnet *ifp = na->ifp;
2633
2634 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2635 D("%s: invalid rings tx %d rx %d",
2636 na->name, na->num_tx_rings, na->num_rx_rings);
2637 return EINVAL;
2638 }
2639 /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2640 * pipes, monitors). For bwrap we actually have a non-null ifp for
2641 * use by the external modules, but that is set after this
2642 * function has been called.
2643 * XXX this is ugly, maybe split this function in two (2014-03-14)
2644 */
2645 if (ifp != NULL) {
2646 WNA(ifp) = na;
2647
2648 /* the following is only needed for na that use the host port.
2649 * XXX do we have something similar for linux ?
2650 */
2651 #ifdef __FreeBSD__
2652 na->if_input = ifp->if_input; /* for netmap_send_up */
2653 #endif /* __FreeBSD__ */
2654
2655 NETMAP_SET_CAPABLE(ifp);
2656 }
2657 if (na->nm_krings_create == NULL) {
2658 /* we assume that we have been called by a driver,
2659 * since other port types all provide their own
2660 * nm_krings_create
2661 */
2662 na->nm_krings_create = netmap_hw_krings_create;
2663 na->nm_krings_delete = netmap_hw_krings_delete;
2664 }
2665 if (na->nm_notify == NULL)
2666 na->nm_notify = netmap_notify;
2667 na->active_fds = 0;
2668
2669 if (na->nm_mem == NULL)
2670 /* use the global allocator */
2671 na->nm_mem = &nm_mem;
2672 netmap_mem_get(na->nm_mem);
2673 #ifdef WITH_VALE
2674 if (na->nm_bdg_attach == NULL)
2675 /* no special nm_bdg_attach callback. On VALE
2676 * attach, we need to interpose a bwrap
2677 */
2678 na->nm_bdg_attach = netmap_bwrap_attach;
2679 #endif
2680 return 0;
2681 }
2682
2683
2684 /* standard cleanup, called by all destructors */
2685 void
netmap_detach_common(struct netmap_adapter * na)2686 netmap_detach_common(struct netmap_adapter *na)
2687 {
2688 if (na->ifp != NULL)
2689 WNA(na->ifp) = NULL; /* XXX do we need this? */
2690
2691 if (na->tx_rings) { /* XXX should not happen */
2692 D("freeing leftover tx_rings");
2693 na->nm_krings_delete(na);
2694 }
2695 netmap_pipe_dealloc(na);
2696 if (na->nm_mem)
2697 netmap_mem_put(na->nm_mem);
2698 bzero(na, sizeof(*na));
2699 free(na, M_NETMAP);
2700 }
2701
2702 /* Wrapper for the register callback provided hardware drivers.
2703 * na->ifp == NULL means the the driver module has been
2704 * unloaded, so we cannot call into it.
2705 * Note that module unloading, in our patched linux drivers,
2706 * happens under NMG_LOCK and after having stopped all the
2707 * nic rings (see netmap_detach). This provides sufficient
2708 * protection for the other driver-provied callbacks
2709 * (i.e., nm_config and nm_*xsync), that therefore don't need
2710 * to wrapped.
2711 */
2712 static int
netmap_hw_register(struct netmap_adapter * na,int onoff)2713 netmap_hw_register(struct netmap_adapter *na, int onoff)
2714 {
2715 struct netmap_hw_adapter *hwna =
2716 (struct netmap_hw_adapter*)na;
2717
2718 if (na->ifp == NULL)
2719 return onoff ? ENXIO : 0;
2720
2721 return hwna->nm_hw_register(na, onoff);
2722 }
2723
2724
2725 /*
2726 * Initialize a ``netmap_adapter`` object created by driver on attach.
2727 * We allocate a block of memory with room for a struct netmap_adapter
2728 * plus two sets of N+2 struct netmap_kring (where N is the number
2729 * of hardware rings):
2730 * krings 0..N-1 are for the hardware queues.
2731 * kring N is for the host stack queue
2732 * kring N+1 is only used for the selinfo for all queues. // XXX still true ?
2733 * Return 0 on success, ENOMEM otherwise.
2734 */
2735 int
netmap_attach(struct netmap_adapter * arg)2736 netmap_attach(struct netmap_adapter *arg)
2737 {
2738 struct netmap_hw_adapter *hwna = NULL;
2739 // XXX when is arg == NULL ?
2740 struct ifnet *ifp = arg ? arg->ifp : NULL;
2741
2742 if (arg == NULL || ifp == NULL)
2743 goto fail;
2744 hwna = malloc(sizeof(*hwna), M_NETMAP, M_NOWAIT | M_ZERO);
2745 if (hwna == NULL)
2746 goto fail;
2747 hwna->up = *arg;
2748 hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
2749 strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2750 hwna->nm_hw_register = hwna->up.nm_register;
2751 hwna->up.nm_register = netmap_hw_register;
2752 if (netmap_attach_common(&hwna->up)) {
2753 free(hwna, M_NETMAP);
2754 goto fail;
2755 }
2756 netmap_adapter_get(&hwna->up);
2757
2758 #ifdef linux
2759 if (ifp->netdev_ops) {
2760 /* prepare a clone of the netdev ops */
2761 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
2762 hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2763 #else
2764 hwna->nm_ndo = *ifp->netdev_ops;
2765 #endif
2766 }
2767 hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2768 if (ifp->ethtool_ops) {
2769 hwna->nm_eto = *ifp->ethtool_ops;
2770 }
2771 hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2772 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
2773 hwna->nm_eto.set_channels = linux_netmap_set_channels;
2774 #endif
2775 if (arg->nm_config == NULL) {
2776 hwna->up.nm_config = netmap_linux_config;
2777 }
2778 #endif /* linux */
2779
2780 if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2781 hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2782 hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2783 return 0;
2784
2785 fail:
2786 D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2787 if (ifp)
2788 netmap_detach(ifp);
2789 return (hwna ? EINVAL : ENOMEM);
2790 }
2791
2792
2793 void
NM_DBG(netmap_adapter_get)2794 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2795 {
2796 if (!na) {
2797 return;
2798 }
2799
2800 refcount_acquire(&na->na_refcount);
2801 }
2802
2803
2804 /* returns 1 iff the netmap_adapter is destroyed */
2805 int
NM_DBG(netmap_adapter_put)2806 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2807 {
2808 if (!na)
2809 return 1;
2810
2811 if (!refcount_release(&na->na_refcount))
2812 return 0;
2813
2814 if (na->nm_dtor)
2815 na->nm_dtor(na);
2816
2817 netmap_detach_common(na);
2818
2819 return 1;
2820 }
2821
2822 /* nm_krings_create callback for all hardware native adapters */
2823 int
netmap_hw_krings_create(struct netmap_adapter * na)2824 netmap_hw_krings_create(struct netmap_adapter *na)
2825 {
2826 int ret = netmap_krings_create(na, 0);
2827 if (ret == 0) {
2828 /* initialize the mbq for the sw rx ring */
2829 mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2830 ND("initialized sw rx queue %d", na->num_rx_rings);
2831 }
2832 return ret;
2833 }
2834
2835
2836
2837 /*
2838 * Called on module unload by the netmap-enabled drivers
2839 */
2840 void
netmap_detach(struct ifnet * ifp)2841 netmap_detach(struct ifnet *ifp)
2842 {
2843 struct netmap_adapter *na = NA(ifp);
2844 int skip;
2845
2846 if (!na)
2847 return;
2848
2849 skip = 0;
2850 NMG_LOCK();
2851 netmap_disable_all_rings(ifp);
2852 na->ifp = NULL;
2853 na->na_flags &= ~NAF_NETMAP_ON;
2854 /*
2855 * if the netmap adapter is not native, somebody
2856 * changed it, so we can not release it here.
2857 * The NULL na->ifp will notify the new owner that
2858 * the driver is gone.
2859 */
2860 if (na->na_flags & NAF_NATIVE) {
2861 skip = netmap_adapter_put(na);
2862 }
2863 /* give them a chance to notice */
2864 if (skip == 0)
2865 netmap_enable_all_rings(ifp);
2866 NMG_UNLOCK();
2867 }
2868
2869
2870 /*
2871 * Intercept packets from the network stack and pass them
2872 * to netmap as incoming packets on the 'software' ring.
2873 *
2874 * We only store packets in a bounded mbq and then copy them
2875 * in the relevant rxsync routine.
2876 *
2877 * We rely on the OS to make sure that the ifp and na do not go
2878 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2879 * In nm_register() or whenever there is a reinitialization,
2880 * we make sure to make the mode change visible here.
2881 */
2882 int
netmap_transmit(struct ifnet * ifp,struct mbuf * m)2883 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2884 {
2885 struct netmap_adapter *na = NA(ifp);
2886 struct netmap_kring *kring;
2887 u_int len = MBUF_LEN(m);
2888 u_int error = ENOBUFS;
2889 struct mbq *q;
2890 int space;
2891
2892 kring = &na->rx_rings[na->num_rx_rings];
2893 // XXX [Linux] we do not need this lock
2894 // if we follow the down/configure/up protocol -gl
2895 // mtx_lock(&na->core_lock);
2896
2897 if (!nm_netmap_on(na)) {
2898 D("%s not in netmap mode anymore", na->name);
2899 error = ENXIO;
2900 goto done;
2901 }
2902
2903 q = &kring->rx_queue;
2904
2905 // XXX reconsider long packets if we handle fragments
2906 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2907 D("%s from_host, drop packet size %d > %d", na->name,
2908 len, NETMAP_BUF_SIZE(na));
2909 goto done;
2910 }
2911
2912 /* protect against rxsync_from_host(), netmap_sw_to_nic()
2913 * and maybe other instances of netmap_transmit (the latter
2914 * not possible on Linux).
2915 * Also avoid overflowing the queue.
2916 */
2917 mbq_lock(q);
2918
2919 space = kring->nr_hwtail - kring->nr_hwcur;
2920 if (space < 0)
2921 space += kring->nkr_num_slots;
2922 if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2923 RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2924 na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2925 len, m);
2926 } else {
2927 mbq_enqueue(q, m);
2928 ND(10, "%s %d bufs in queue len %d m %p",
2929 na->name, mbq_len(q), len, m);
2930 /* notify outside the lock */
2931 m = NULL;
2932 error = 0;
2933 }
2934 mbq_unlock(q);
2935
2936 done:
2937 if (m)
2938 m_freem(m);
2939 /* unconditionally wake up listeners */
2940 kring->nm_notify(kring, 0);
2941 /* this is normally netmap_notify(), but for nics
2942 * connected to a bridge it is netmap_bwrap_intr_notify(),
2943 * that possibly forwards the frames through the switch
2944 */
2945
2946 return (error);
2947 }
2948
2949
2950 /*
2951 * netmap_reset() is called by the driver routines when reinitializing
2952 * a ring. The driver is in charge of locking to protect the kring.
2953 * If native netmap mode is not set just return NULL.
2954 */
2955 struct netmap_slot *
netmap_reset(struct netmap_adapter * na,enum txrx tx,u_int n,u_int new_cur)2956 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2957 u_int new_cur)
2958 {
2959 struct netmap_kring *kring;
2960 int new_hwofs, lim;
2961
2962 if (!nm_native_on(na)) {
2963 ND("interface not in native netmap mode");
2964 return NULL; /* nothing to reinitialize */
2965 }
2966
2967 /* XXX note- in the new scheme, we are not guaranteed to be
2968 * under lock (e.g. when called on a device reset).
2969 * In this case, we should set a flag and do not trust too
2970 * much the values. In practice: TODO
2971 * - set a RESET flag somewhere in the kring
2972 * - do the processing in a conservative way
2973 * - let the *sync() fixup at the end.
2974 */
2975 if (tx == NR_TX) {
2976 if (n >= na->num_tx_rings)
2977 return NULL;
2978 kring = na->tx_rings + n;
2979 // XXX check whether we should use hwcur or rcur
2980 new_hwofs = kring->nr_hwcur - new_cur;
2981 } else {
2982 if (n >= na->num_rx_rings)
2983 return NULL;
2984 kring = na->rx_rings + n;
2985 new_hwofs = kring->nr_hwtail - new_cur;
2986 }
2987 lim = kring->nkr_num_slots - 1;
2988 if (new_hwofs > lim)
2989 new_hwofs -= lim + 1;
2990
2991 /* Always set the new offset value and realign the ring. */
2992 if (netmap_verbose)
2993 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2994 na->name,
2995 tx == NR_TX ? "TX" : "RX", n,
2996 kring->nkr_hwofs, new_hwofs,
2997 kring->nr_hwtail,
2998 tx == NR_TX ? lim : kring->nr_hwtail);
2999 kring->nkr_hwofs = new_hwofs;
3000 if (tx == NR_TX) {
3001 kring->nr_hwtail = kring->nr_hwcur + lim;
3002 if (kring->nr_hwtail > lim)
3003 kring->nr_hwtail -= lim + 1;
3004 }
3005
3006 #if 0 // def linux
3007 /* XXX check that the mappings are correct */
3008 /* need ring_nr, adapter->pdev, direction */
3009 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3010 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3011 D("error mapping rx netmap buffer %d", i);
3012 // XXX fix error handling
3013 }
3014
3015 #endif /* linux */
3016 /*
3017 * Wakeup on the individual and global selwait
3018 * We do the wakeup here, but the ring is not yet reconfigured.
3019 * However, we are under lock so there are no races.
3020 */
3021 kring->nm_notify(kring, 0);
3022 return kring->ring->slot;
3023 }
3024
3025
3026 /*
3027 * Dispatch rx/tx interrupts to the netmap rings.
3028 *
3029 * "work_done" is non-null on the RX path, NULL for the TX path.
3030 * We rely on the OS to make sure that there is only one active
3031 * instance per queue, and that there is appropriate locking.
3032 *
3033 * The 'notify' routine depends on what the ring is attached to.
3034 * - for a netmap file descriptor, do a selwakeup on the individual
3035 * waitqueue, plus one on the global one if needed
3036 * (see netmap_notify)
3037 * - for a nic connected to a switch, call the proper forwarding routine
3038 * (see netmap_bwrap_intr_notify)
3039 */
3040 void
netmap_common_irq(struct ifnet * ifp,u_int q,u_int * work_done)3041 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3042 {
3043 struct netmap_adapter *na = NA(ifp);
3044 struct netmap_kring *kring;
3045 enum txrx t = (work_done ? NR_RX : NR_TX);
3046
3047 q &= NETMAP_RING_MASK;
3048
3049 if (netmap_verbose) {
3050 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3051 }
3052
3053 if (q >= nma_get_nrings(na, t))
3054 return; // not a physical queue
3055
3056 kring = NMR(na, t) + q;
3057
3058 if (t == NR_RX) {
3059 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ?
3060 *work_done = 1; /* do not fire napi again */
3061 }
3062 kring->nm_notify(kring, 0);
3063 }
3064
3065
3066 /*
3067 * Default functions to handle rx/tx interrupts from a physical device.
3068 * "work_done" is non-null on the RX path, NULL for the TX path.
3069 *
3070 * If the card is not in netmap mode, simply return 0,
3071 * so that the caller proceeds with regular processing.
3072 * Otherwise call netmap_common_irq() and return 1.
3073 *
3074 * If the card is connected to a netmap file descriptor,
3075 * do a selwakeup on the individual queue, plus one on the global one
3076 * if needed (multiqueue card _and_ there are multiqueue listeners),
3077 * and return 1.
3078 *
3079 * Finally, if called on rx from an interface connected to a switch,
3080 * calls the proper forwarding routine, and return 1.
3081 */
3082 int
netmap_rx_irq(struct ifnet * ifp,u_int q,u_int * work_done)3083 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3084 {
3085 struct netmap_adapter *na = NA(ifp);
3086
3087 /*
3088 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3089 * we still use the regular driver even though the previous
3090 * check fails. It is unclear whether we should use
3091 * nm_native_on() here.
3092 */
3093 if (!nm_netmap_on(na))
3094 return 0;
3095
3096 if (na->na_flags & NAF_SKIP_INTR) {
3097 ND("use regular interrupt");
3098 return 0;
3099 }
3100
3101 netmap_common_irq(ifp, q, work_done);
3102 return 1;
3103 }
3104
3105
/*
 * Module loader and unloader.
 *
 * netmap_init() initializes the global state and creates the
 * /dev/netmap device. It returns 0 on success and a non-zero errno
 * value on failure.
 *
 * netmap_fini() destroys everything again.
 */

/* Device switch table used when creating /dev/netmap. */
extern struct cdevsw netmap_cdevsw;

/* The /dev/netmap character device; assigned by netmap_init(). */
static struct cdev *netmap_dev;
3118
3119
3120 void
netmap_fini(void)3121 netmap_fini(void)
3122 {
3123 netmap_uninit_bridges();
3124 if (netmap_dev)
3125 destroy_dev(netmap_dev);
3126 netmap_mem_fini();
3127 NMG_LOCK_DESTROY();
3128 printf("netmap: unloaded module.\n");
3129 }
3130
3131
3132 int
netmap_init(void)3133 netmap_init(void)
3134 {
3135 int error;
3136
3137 NMG_LOCK_INIT();
3138
3139 error = netmap_mem_init();
3140 if (error != 0)
3141 goto fail;
3142 /*
3143 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3144 * when the module is compiled in.
3145 * XXX could use make_dev_credv() to get error number
3146 */
3147 netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3148 &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3149 "netmap");
3150 if (!netmap_dev)
3151 goto fail;
3152
3153 error = netmap_init_bridges();
3154 if (error)
3155 goto fail;
3156
3157 #ifdef __FreeBSD__
3158 nm_vi_init_index();
3159 #endif
3160
3161 printf("netmap: loaded module\n");
3162 return (0);
3163 fail:
3164 netmap_fini();
3165 return (EINVAL); /* may be incorrect */
3166 }
3167