1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (C) 2011-2014 Matteo Landi
5 * Copyright (C) 2011-2016 Luigi Rizzo
6 * Copyright (C) 2011-2016 Giuseppe Lettieri
7 * Copyright (C) 2011-2016 Vincenzo Maffione
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32
33 /*
34 * $FreeBSD: stable/12/sys/dev/netmap/netmap.c 372828 2022-12-31 12:30:25Z vmaffione $
35 *
36 * This module supports memory mapped access to network devices,
37 * see netmap(4).
38 *
 * The module uses a large memory pool allocated by the kernel
40 * and accessible as mmapped memory by multiple userspace threads/processes.
41 * The memory pool contains packet buffers and "netmap rings",
42 * i.e. user-accessible copies of the interface's queues.
43 *
44 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
47 * 2. on each descriptor, the process issues an ioctl() to identify
48 * the interface that should report events to the file descriptor.
49 * 3. on each descriptor, the process issues an mmap() request to
50 * map the shared memory region within the process' address space.
51 * The list of interesting queues is indicated by a location in
52 * the shared memory region.
53 * 4. using the functions in the netmap(4) userspace API, a process
54 * can look up the occupation state of a queue, access memory buffers,
55 * and retrieve received packets or enqueue packets to transmit.
56 * 5. using some ioctl()s the process can synchronize the userspace view
57 * of the queue with the actual status in the kernel. This includes both
58 * receiving the notification of new packets, and transmitting new
59 * packets on the output interface.
60 * 6. select() or poll() can be used to wait for events on individual
61 * transmit or receive queues (or all queues for a given interface).
62 *
63
64 SYNCHRONIZATION (USER)
65
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73
74 LOCKING (INTERNAL)
75
76 Within the kernel, access to the netmap rings is protected as follows:
77
    - a spinlock on each ring, to handle producer/consumer races on
      RX rings attached to the host stack (against multiple host
      threads writing from the host stack to the same ring),
      and on 'destination' rings attached to a VALE switch
      (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
      protecting against multiple active senders for the same destination.
84
85 - an atomic variable to guarantee that there is at most one
86 instance of *_*xsync() on the ring at any time.
87 For rings connected to user file
88 descriptors, an atomic_test_and_set() protects this, and the
89 lock on the ring is not actually used.
90 For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91 is also used to prevent multiple executions (the driver might indeed
92 already guarantee this).
93 For NIC TX rings connected to a VALE switch, the lock arbitrates
94 access to the queue (both when allocating buffers and when pushing
95 them out).
96
97 - *xsync() should be protected against initializations of the card.
98 On FreeBSD most devices have the reset routine protected by
99 a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100 the RING protection on rx_reset(), this should be added.
101
102 On linux there is an external lock on the tx path, which probably
103 also arbitrates access to the reset routine. XXX to be revised
104
105 - a per-interface core_lock protecting access from the host stack
106 while interfaces may be detached from netmap mode.
107 XXX there should be no need for this lock if we detach the interfaces
108 only while they are down.
109
110
111 --- VALE SWITCH ---
112
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When configuring or deleting a new port, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
    The lock is held throughout the entire forwarding cycle,
    during which the thread may incur a page fault.
    Hence it is important that sleepable shared locks are used.
123
    On the rx ring, the per-port lock is grabbed initially to reserve
    a number of slots in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130
131 */
132
133
134 /* --- internals ----
135 *
136 * Roadmap to the code that implements the above.
137 *
 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    a select()able file descriptor on which events are reported.
140 *
141 * Internally, we allocate a netmap_priv_d structure, that will be
142 * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143 * structure for each open().
144 *
145 * os-specific:
146 * FreeBSD: see netmap_open() (netmap_freebsd.c)
147 * linux: see linux_netmap_open() (netmap_linux.c)
148 *
149 * > 2. on each descriptor, the process issues an ioctl() to identify
150 * > the interface that should report events to the file descriptor.
151 *
152 * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153 * Most important things happen in netmap_get_na() and
154 * netmap_do_regif(), called from there. Additional details can be
155 * found in the comments above those functions.
156 *
157 * In all cases, this action creates/takes-a-reference-to a
158 * netmap_*_adapter describing the port, and allocates a netmap_if
159 * and all necessary netmap rings, filling them with netmap buffers.
160 *
161 * In this phase, the sync callbacks for each ring are set (these are used
162 * in steps 5 and 6 below). The callbacks depend on the type of adapter.
163 * The adapter creation/initialization code puts them in the
164 * netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they
165 * are copied from there to the netmap_kring's during netmap_do_regif(), by
166 * the nm_krings_create() callback. All the nm_krings_create callbacks
167 * actually call netmap_krings_create() to perform this and the other
168 * common stuff. netmap_krings_create() also takes care of the host rings,
169 * if needed, by setting their sync callbacks appropriately.
170 *
171 * Additional actions depend on the kind of netmap_adapter that has been
172 * registered:
173 *
174 * - netmap_hw_adapter: [netmap.c]
175 * This is a system netdev/ifp with native netmap support.
176 * The ifp is detached from the host stack by redirecting:
177 * - transmissions (from the network stack) to netmap_transmit()
178 * - receive notifications to the nm_notify() callback for
179 * this adapter. The callback is normally netmap_notify(), unless
180 * the ifp is attached to a bridge using bwrap, in which case it
181 * is netmap_bwrap_intr_notify().
182 *
183 * - netmap_generic_adapter: [netmap_generic.c]
184 * A system netdev/ifp without native netmap support.
185 *
186 * (the decision about native/non native support is taken in
187 * netmap_get_hw_na(), called by netmap_get_na())
188 *
189 * - netmap_vp_adapter [netmap_vale.c]
190 * Returned by netmap_get_bdg_na().
191 * This is a persistent or ephemeral VALE port. Ephemeral ports
192 * are created on the fly if they don't already exist, and are
193 * always attached to a bridge.
 *      Persistent VALE ports must be created separately, and
 *      then attached like normal NICs. The NIOCREGIF we are examining
 *      will find them only if they had previously been created and
 *      attached (see VALE_CTL below).
198 *
199 * - netmap_pipe_adapter [netmap_pipe.c]
200 * Returned by netmap_get_pipe_na().
201 * Both pipe ends are created, if they didn't already exist.
202 *
203 * - netmap_monitor_adapter [netmap_monitor.c]
204 * Returned by netmap_get_monitor_na().
205 * If successful, the nm_sync callbacks of the monitored adapter
206 * will be intercepted by the returned monitor.
207 *
208 * - netmap_bwrap_adapter [netmap_vale.c]
209 * Cannot be obtained in this way, see VALE_CTL below
210 *
211 *
212 * os-specific:
213 * linux: we first go through linux_netmap_ioctl() to
214 * adapt the FreeBSD interface to the linux one.
215 *
216 *
217 * > 3. on each descriptor, the process issues an mmap() request to
218 * > map the shared memory region within the process' address space.
219 * > The list of interesting queues is indicated by a location in
220 * > the shared memory region.
221 *
222 * os-specific:
223 * FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224 * linux: linux_netmap_mmap (netmap_linux.c).
225 *
226 * > 4. using the functions in the netmap(4) userspace API, a process
227 * > can look up the occupation state of a queue, access memory buffers,
228 * > and retrieve received packets or enqueue packets to transmit.
229 *
230 * these actions do not involve the kernel.
231 *
232 * > 5. using some ioctl()s the process can synchronize the userspace view
233 * > of the queue with the actual status in the kernel. This includes both
234 * > receiving the notification of new packets, and transmitting new
235 * > packets on the output interface.
236 *
237 * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238 * cases. They invoke the nm_sync callbacks on the netmap_kring
239 * structures, as initialized in step 2 and maybe later modified
240 * by a monitor. Monitors, however, will always call the original
241 * callback before doing anything else.
242 *
243 *
244 * > 6. select() or poll() can be used to wait for events on individual
245 * > transmit or receive queues (or all queues for a given interface).
246 *
247 * Implemented in netmap_poll(). This will call the same nm_sync()
248 * callbacks as in step 5 above.
249 *
250 * os-specific:
251 * linux: we first go through linux_netmap_poll() to adapt
252 * the FreeBSD interface to the linux one.
253 *
254 *
255 * ---- VALE_CTL -----
256 *
257 * VALE switches are controlled by issuing a NIOCREGIF with a non-null
258 * nr_cmd in the nmreq structure. These subcommands are handled by
259 * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260 * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261 * subcommands, respectively.
262 *
263 * Any network interface known to the system (including a persistent VALE
264 * port) can be attached to a VALE switch by issuing the
265 * NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266 * look exactly like ephemeral VALE ports (as created in step 2 above). The
267 * attachment of other interfaces, instead, requires the creation of a
268 * netmap_bwrap_adapter. Moreover, the attached interface must be put in
269 * netmap mode. This may require the creation of a netmap_generic_adapter if
270 * we have no native support for the interface, or if generic adapters have
271 * been forced by sysctl.
272 *
273 * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274 * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275 * callback. In the case of the bwrap, the callback creates the
276 * netmap_bwrap_adapter. The initialization of the bwrap is then
277 * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278 * callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279 * A generic adapter for the wrapped ifp will be created if needed, when
280 * netmap_get_bdg_na() calls netmap_get_hw_na().
281 *
282 *
283 * ---- DATAPATHS -----
284 *
285 * -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286 *
287 * na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288 *
289 * - tx from netmap userspace:
290 * concurrently:
291 * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292 * kring->nm_sync() == DEVICE_netmap_txsync()
293 * 2) device interrupt handler
294 * na->nm_notify() == netmap_notify()
295 * - rx from netmap userspace:
296 * concurrently:
297 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298 * kring->nm_sync() == DEVICE_netmap_rxsync()
299 * 2) device interrupt handler
300 * na->nm_notify() == netmap_notify()
301 * - rx from host stack
302 * concurrently:
303 * 1) host stack
304 * netmap_transmit()
305 * na->nm_notify == netmap_notify()
306 * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307 * kring->nm_sync() == netmap_rxsync_from_host
308 * netmap_rxsync_from_host(na, NULL, NULL)
309 * - tx to host stack
310 * ioctl(NIOCTXSYNC)/netmap_poll() in process context
311 * kring->nm_sync() == netmap_txsync_to_host
312 * netmap_txsync_to_host(na)
313 * nm_os_send_up()
314 * FreeBSD: na->if_input() == ether_input()
315 * linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316 *
317 *
318 * -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319 *
320 * na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321 *
322 * - tx from netmap userspace:
323 * concurrently:
324 * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325 * kring->nm_sync() == generic_netmap_txsync()
326 * nm_os_generic_xmit_frame()
327 * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328 * ifp->ndo_start_xmit == generic_ndo_start_xmit()
329 * gna->save_start_xmit == orig. dev. start_xmit
330 * FreeBSD: na->if_transmit() == orig. dev if_transmit
331 * 2) generic_mbuf_destructor()
332 * na->nm_notify() == netmap_notify()
333 * - rx from netmap userspace:
334 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335 * kring->nm_sync() == generic_netmap_rxsync()
336 * mbq_safe_dequeue()
337 * 2) device driver
338 * generic_rx_handler()
339 * mbq_safe_enqueue()
340 * na->nm_notify() == netmap_notify()
341 * - rx from host stack
342 * FreeBSD: same as native
343 * Linux: same as native except:
344 * 1) host stack
345 * dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346 * ifp->ndo_start_xmit == generic_ndo_start_xmit()
347 * netmap_transmit()
348 * na->nm_notify() == netmap_notify()
349 * - tx to host stack (same as native):
350 *
351 *
352 * -= VALE =-
353 *
354 * INCOMING:
355 *
356 * - VALE ports:
357 * ioctl(NIOCTXSYNC)/netmap_poll() in process context
358 * kring->nm_sync() == netmap_vp_txsync()
359 *
360 * - system device with native support:
361 * from cable:
362 * interrupt
363 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364 * kring->nm_sync() == DEVICE_netmap_rxsync()
365 * netmap_vp_txsync()
366 * kring->nm_sync() == DEVICE_netmap_rxsync()
367 * from host stack:
368 * netmap_transmit()
369 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370 * kring->nm_sync() == netmap_rxsync_from_host()
371 * netmap_vp_txsync()
372 *
373 * - system device with generic support:
374 * from device driver:
375 * generic_rx_handler()
376 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377 * kring->nm_sync() == generic_netmap_rxsync()
378 * netmap_vp_txsync()
379 * kring->nm_sync() == generic_netmap_rxsync()
380 * from host stack:
381 * netmap_transmit()
382 * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383 * kring->nm_sync() == netmap_rxsync_from_host()
384 * netmap_vp_txsync()
385 *
386 * (all cases) --> nm_bdg_flush()
387 * dest_na->nm_notify() == (see below)
388 *
389 * OUTGOING:
390 *
391 * - VALE ports:
392 * concurrently:
393 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394 * kring->nm_sync() == netmap_vp_rxsync()
395 * 2) from nm_bdg_flush()
396 * na->nm_notify() == netmap_notify()
397 *
398 * - system device with native support:
399 * to cable:
400 * na->nm_notify() == netmap_bwrap_notify()
401 * netmap_vp_rxsync()
402 * kring->nm_sync() == DEVICE_netmap_txsync()
403 * netmap_vp_rxsync()
404 * to host stack:
405 * netmap_vp_rxsync()
406 * kring->nm_sync() == netmap_txsync_to_host
407 * netmap_vp_rxsync_locked()
408 *
409 * - system device with generic adapter:
410 * to device driver:
411 * na->nm_notify() == netmap_bwrap_notify()
412 * netmap_vp_rxsync()
413 * kring->nm_sync() == generic_netmap_txsync()
414 * netmap_vp_rxsync()
415 * to host stack:
416 * netmap_vp_rxsync()
417 * kring->nm_sync() == netmap_txsync_to_host
418 * netmap_vp_rxsync()
419 *
420 */
421
422 /*
423 * OS-specific code that is used only within this file.
424 * Other OS-specific code that must be accessed by drivers
425 * is present in netmap_kern.h
426 */
427
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h> /* defines used in kernel.h */
433 #include <sys/kernel.h> /* types used in module initialization */
434 #include <sys/conf.h> /* cdevsw struct, UID, GID */
435 #include <sys/filio.h> /* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h> /* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h> /* BIOCIMMEDIATE */
449 #include <machine/bus.h> /* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 #include <net/ethernet.h> /* ETHER_BPF_MTAP */
453
454
455 #elif defined(linux)
456
457 #include "bsd_glue.h"
458
459 #elif defined(__APPLE__)
460
461 #warning OSX support is only partial
462 #include "osx_glue.h"
463
464 #elif defined (_WIN32)
465
466 #include "win_glue.h"
467
468 #else
469
470 #error Unsupported platform
471
472 #endif /* unsupported */
473
474 /*
475 * common headers
476 */
477 #include <net/netmap.h>
478 #include <dev/netmap/netmap_kern.h>
479 #include <dev/netmap/netmap_mem2.h>
480
481
/* user-controlled variables (exported through the dev.netmap sysctl tree) */
int netmap_verbose;
#ifdef CONFIG_NETMAP_DEBUG
int netmap_debug;
#endif /* CONFIG_NETMAP_DEBUG */

static int netmap_no_timestamp;		/* don't timestamp on rxsync */
int netmap_no_pendintr = 1;
int netmap_txsync_retry = 2;
static int netmap_fwd = 0;		/* force transparent forwarding */

/*
 * netmap_admode selects the netmap mode to use.
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum {
	NETMAP_ADMODE_BEST = 0,		/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE = 1,	/* either native or none */
	NETMAP_ADMODE_GENERIC = 2,	/* force generic */
	NETMAP_ADMODE_LAST = 3
};
static int netmap_admode = NETMAP_ADMODE_BEST;

/*
 * netmap_generic_mit controls mitigation of RX notifications for
 * the generic netmap adapter. The value is a time interval in
 * nanoseconds.
 */
int netmap_generic_mit = 100*1000;

/*
 * We use by default netmap-aware qdiscs with generic netmap adapters,
 * even if there can be a little performance hit with hardware NICs.
 * However, using the qdisc is the safer approach, for two reasons:
 * 1) it prevents non-fifo qdiscs from breaking the TX notification
 *    scheme, which is based on mbuf destructors when txqdisc is
 *    not used.
 * 2) it makes it possible to transmit over software devices that
 *    change skb->dev, like bridge, veth, ...
 *
 * Anyway users looking for the best performance should
 * use native adapters.
 */
#ifdef linux
int netmap_generic_txqdisc = 1;
#endif

/* Default number of slots and queues for generic adapters. */
int netmap_generic_ringsize = 1024;
int netmap_generic_rings = 1;

/* Non-zero to enable checksum offloading in NIC drivers */
int netmap_generic_hwcsum = 0;

/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
int ptnet_vnet_hdr = 1;
533
534 /*
535 * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
536 * in some other operating systems
537 */
538 SYSBEGIN(main_init);
539
540 SYSCTL_DECL(_dev_netmap);
541 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
543 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
544 #ifdef CONFIG_NETMAP_DEBUG
545 SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
546 CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
547 #endif /* CONFIG_NETMAP_DEBUG */
548 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
549 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
550 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
551 0, "Always look for new received packets.");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
553 &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
554
555 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
556 "Force NR_FORWARD mode");
557 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
558 "Adapter mode. 0 selects the best option available,"
559 "1 forces native adapter, 2 forces emulated adapter");
560 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
561 0, "Hardware checksums. 0 to disable checksum generation by the NIC (default),"
562 "1 to enable checksum generation by the NIC");
563 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
564 0, "RX notification interval in nanoseconds");
565 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
566 &netmap_generic_ringsize, 0,
567 "Number of per-ring slots for emulated netmap mode");
568 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
569 &netmap_generic_rings, 0,
570 "Number of TX/RX queues for emulated netmap adapters");
571 #ifdef linux
572 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
573 &netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
574 #endif
575 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
576 0, "Allow ptnet devices to use virtio-net headers");
577
578 SYSEND;
579
580 NMG_LOCK_T netmap_global_lock;
581
582 /*
583 * mark the ring as stopped, and run through the locks
584 * to make sure other users get to see it.
585 * stopped must be either NR_KR_STOPPED (for unbounded stop)
586 * of NR_KR_LOCKED (brief stop for mutual exclusion purposes)
587 */
588 static void
netmap_disable_ring(struct netmap_kring * kr,int stopped)589 netmap_disable_ring(struct netmap_kring *kr, int stopped)
590 {
591 nm_kr_stop(kr, stopped);
592 // XXX check if nm_kr_stop is sufficient
593 mtx_lock(&kr->q_lock);
594 mtx_unlock(&kr->q_lock);
595 nm_kr_put(kr);
596 }
597
598 /* stop or enable a single ring */
599 void
netmap_set_ring(struct netmap_adapter * na,u_int ring_id,enum txrx t,int stopped)600 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
601 {
602 if (stopped)
603 netmap_disable_ring(NMR(na, t)[ring_id], stopped);
604 else
605 NMR(na, t)[ring_id]->nkr_stopped = 0;
606 }
607
608
609 /* stop or enable all the rings of na */
610 void
netmap_set_all_rings(struct netmap_adapter * na,int stopped)611 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
612 {
613 int i;
614 enum txrx t;
615
616 if (!nm_netmap_on(na))
617 return;
618
619 if (netmap_verbose) {
620 nm_prinf("%s: %sable all rings", na->name,
621 (stopped ? "dis" : "en"));
622 }
623 for_rx_tx(t) {
624 for (i = 0; i < netmap_real_rings(na, t); i++) {
625 netmap_set_ring(na, i, t, stopped);
626 }
627 }
628 }
629
630 /*
631 * Convenience function used in drivers. Waits for current txsync()s/rxsync()s
632 * to finish and prevents any new one from starting. Call this before turning
633 * netmap mode off, or before removing the hardware rings (e.g., on module
634 * onload).
635 */
636 void
netmap_disable_all_rings(struct ifnet * ifp)637 netmap_disable_all_rings(struct ifnet *ifp)
638 {
639 if (NM_NA_VALID(ifp)) {
640 netmap_set_all_rings(NA(ifp), NM_KR_LOCKED);
641 }
642 }
643
/*
 * Convenience function used in drivers. Re-enables rxsync and txsync on the
 * adapter's rings. In linux drivers, this should be placed near each
 * napi_enable().
 * It is a no-op when ifp has no valid netmap adapter.
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	if (!NM_NA_VALID(ifp))
		return;

	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
}
656
657 void
netmap_make_zombie(struct ifnet * ifp)658 netmap_make_zombie(struct ifnet *ifp)
659 {
660 if (NM_NA_VALID(ifp)) {
661 struct netmap_adapter *na = NA(ifp);
662 netmap_set_all_rings(na, NM_KR_LOCKED);
663 na->na_flags |= NAF_ZOMBIE;
664 netmap_set_all_rings(na, 0);
665 }
666 }
667
668 void
netmap_undo_zombie(struct ifnet * ifp)669 netmap_undo_zombie(struct ifnet *ifp)
670 {
671 if (NM_NA_VALID(ifp)) {
672 struct netmap_adapter *na = NA(ifp);
673 if (na->na_flags & NAF_ZOMBIE) {
674 netmap_set_all_rings(na, NM_KR_LOCKED);
675 na->na_flags &= ~NAF_ZOMBIE;
676 netmap_set_all_rings(na, 0);
677 }
678 }
679 }
680
681 /*
682 * generic bound_checking function
683 */
684 u_int
nm_bound_var(u_int * v,u_int dflt,u_int lo,u_int hi,const char * msg)685 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
686 {
687 u_int oldv = *v;
688 const char *op = NULL;
689
690 if (dflt < lo)
691 dflt = lo;
692 if (dflt > hi)
693 dflt = hi;
694 if (oldv < lo) {
695 *v = dflt;
696 op = "Bump";
697 } else if (oldv > hi) {
698 *v = hi;
699 op = "Clamp";
700 }
701 if (op && msg)
702 nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
703 return *v;
704 }
705
706
/*
 * Packet-dump function, user-supplied or static buffer.
 *
 * p	buffer to dump
 * len	number of valid bytes in p
 * lim	maximum number of bytes to dump; a value <= 0 or > len means len
 * dst	destination buffer, or NULL to use an internal static buffer
 *
 * Returns the buffer holding the NUL-terminated dump: one header line
 * followed by one row for every 16 input bytes (offset, hex bytes,
 * printable characters).
 *
 * Each row takes up to 72 characters (more once the offset needs over
 * five digits), so a user-supplied destination must hold at least
 * 64 + 72 * ((lim + 15) / 16) bytes. When the static buffer is used
 * the dump is truncated to the number of whole rows that fit in it.
 * The static buffer is shared by all callers passing dst == NULL.
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	enum {
		DUMP_HDR_ROOM = 64,	/* header line plus final NUL */
		DUMP_ROW_BYTES = 16,	/* input bytes per row */
		DUMP_ROW_CHARS = 72	/* "%5d: " + 48 hex + 16 chars + newline */
	};
	static char _dst[8192];
	static const char hex[] = "0123456789abcdef";
	int i, j, i0;
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (lim <= 0 || lim > len)
		lim = len;
	if (!dst) {
		/* never write past the end of the fixed-size static buffer */
		const int max_lim =
		    (int)((sizeof(_dst) - DUMP_HDR_ROOM) / DUMP_ROW_CHARS) *
		    DUMP_ROW_BYTES;

		dst = _dst;
		if (lim > max_lim)
			lim = max_lim;
	}
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		/* blank the hex area, so that a short last row stays aligned */
		memset(o, ' ', 48);
		i0 = i;
		for (j = 0; j < DUMP_ROW_BYTES && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		/* second pass over the same bytes for the character column */
		i = i0;
		for (j = 0; j < DUMP_ROW_BYTES && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}
751
752
753 /*
754 * Fetch configuration from the device, to cope with dynamic
755 * reconfigurations after loading the module.
756 */
757 /* call with NMG_LOCK held */
758 int
netmap_update_config(struct netmap_adapter * na)759 netmap_update_config(struct netmap_adapter *na)
760 {
761 struct nm_config_info info;
762
763 if (na->ifp && !nm_is_bwrap(na)) {
764 strlcpy(na->name, na->ifp->if_xname, sizeof(na->name));
765 }
766
767 bzero(&info, sizeof(info));
768 if (na->nm_config == NULL ||
769 na->nm_config(na, &info)) {
770 /* take whatever we had at init time */
771 info.num_tx_rings = na->num_tx_rings;
772 info.num_tx_descs = na->num_tx_desc;
773 info.num_rx_rings = na->num_rx_rings;
774 info.num_rx_descs = na->num_rx_desc;
775 info.rx_buf_maxsize = na->rx_buf_maxsize;
776 }
777
778 if (na->num_tx_rings == info.num_tx_rings &&
779 na->num_tx_desc == info.num_tx_descs &&
780 na->num_rx_rings == info.num_rx_rings &&
781 na->num_rx_desc == info.num_rx_descs &&
782 na->rx_buf_maxsize == info.rx_buf_maxsize)
783 return 0; /* nothing changed */
784 if (na->active_fds == 0) {
785 na->num_tx_rings = info.num_tx_rings;
786 na->num_tx_desc = info.num_tx_descs;
787 na->num_rx_rings = info.num_rx_rings;
788 na->num_rx_desc = info.num_rx_descs;
789 na->rx_buf_maxsize = info.rx_buf_maxsize;
790 if (netmap_verbose)
791 nm_prinf("configuration changed for %s: txring %d x %d, "
792 "rxring %d x %d, rxbufsz %d",
793 na->name, na->num_tx_rings, na->num_tx_desc,
794 na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
795 return 0;
796 }
797 nm_prerr("WARNING: configuration changed for %s while active: "
798 "txring %d x %d, rxring %d x %d, rxbufsz %d",
799 na->name, info.num_tx_rings, info.num_tx_descs,
800 info.num_rx_rings, info.num_rx_descs,
801 info.rx_buf_maxsize);
802 return 1;
803 }
804
805 /* nm_sync callbacks for the host rings */
806 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
807 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
808
809 /* create the krings array and initialize the fields common to all adapters.
810 * The array layout is this:
811 *
812 * +----------+
813 * na->tx_rings ----->| | \
814 * | | } na->num_tx_ring
815 * | | /
816 * +----------+
817 * | | host tx kring
818 * na->rx_rings ----> +----------+
819 * | | \
820 * | | } na->num_rx_rings
821 * | | /
822 * +----------+
823 * | | host rx kring
824 * +----------+
825 * na->tailroom ----->| | \
826 * | | } tailroom bytes
827 * | | /
828 * +----------+
829 *
830 * Note: for compatibility, host krings are created even when not needed.
831 * The tailroom space is currently used by vale ports for allocating leases.
832 */
833 /* call with NMG_LOCK held */
834 int
netmap_krings_create(struct netmap_adapter * na,u_int tailroom)835 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
836 {
837 u_int i, len, ndesc;
838 struct netmap_kring *kring;
839 u_int n[NR_TXRX];
840 enum txrx t;
841 int err = 0;
842
843 if (na->tx_rings != NULL) {
844 if (netmap_debug & NM_DEBUG_ON)
845 nm_prerr("warning: krings were already created");
846 return 0;
847 }
848
849 /* account for the (possibly fake) host rings */
850 n[NR_TX] = netmap_all_rings(na, NR_TX);
851 n[NR_RX] = netmap_all_rings(na, NR_RX);
852
853 len = (n[NR_TX] + n[NR_RX]) *
854 (sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
855 + tailroom;
856
857 na->tx_rings = nm_os_malloc((size_t)len);
858 if (na->tx_rings == NULL) {
859 nm_prerr("Cannot allocate krings");
860 return ENOMEM;
861 }
862 na->rx_rings = na->tx_rings + n[NR_TX];
863 na->tailroom = na->rx_rings + n[NR_RX];
864
865 /* link the krings in the krings array */
866 kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
867 for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
868 na->tx_rings[i] = kring;
869 kring++;
870 }
871
872 /*
873 * All fields in krings are 0 except the one initialized below.
874 * but better be explicit on important kring fields.
875 */
876 for_rx_tx(t) {
877 ndesc = nma_get_ndesc(na, t);
878 for (i = 0; i < n[t]; i++) {
879 kring = NMR(na, t)[i];
880 bzero(kring, sizeof(*kring));
881 kring->notify_na = na;
882 kring->ring_id = i;
883 kring->tx = t;
884 kring->nkr_num_slots = ndesc;
885 kring->nr_mode = NKR_NETMAP_OFF;
886 kring->nr_pending_mode = NKR_NETMAP_OFF;
887 if (i < nma_get_nrings(na, t)) {
888 kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
889 } else {
890 if (!(na->na_flags & NAF_HOST_RINGS))
891 kring->nr_kflags |= NKR_FAKERING;
892 kring->nm_sync = (t == NR_TX ?
893 netmap_txsync_to_host:
894 netmap_rxsync_from_host);
895 }
896 kring->nm_notify = na->nm_notify;
897 kring->rhead = kring->rcur = kring->nr_hwcur = 0;
898 /*
899 * IMPORTANT: Always keep one slot empty.
900 */
901 kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
902 snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
903 nm_txrx2str(t), i);
904 nm_prdis("ktx %s h %d c %d t %d",
905 kring->name, kring->rhead, kring->rcur, kring->rtail);
906 err = nm_os_selinfo_init(&kring->si, kring->name);
907 if (err) {
908 netmap_krings_delete(na);
909 return err;
910 }
911 mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
912 kring->na = na; /* setting this field marks the mutex as initialized */
913 }
914 err = nm_os_selinfo_init(&na->si[t], na->name);
915 if (err) {
916 netmap_krings_delete(na);
917 return err;
918 }
919 }
920
921 return 0;
922 }
923
924
925 /* undo the actions performed by netmap_krings_create */
926 /* call with NMG_LOCK held */
927 void
netmap_krings_delete(struct netmap_adapter * na)928 netmap_krings_delete(struct netmap_adapter *na)
929 {
930 struct netmap_kring **kring = na->tx_rings;
931 enum txrx t;
932
933 if (na->tx_rings == NULL) {
934 if (netmap_debug & NM_DEBUG_ON)
935 nm_prerr("warning: krings were already deleted");
936 return;
937 }
938
939 for_rx_tx(t)
940 nm_os_selinfo_uninit(&na->si[t]);
941
942 /* we rely on the krings layout described above */
943 for ( ; kring != na->tailroom; kring++) {
944 if ((*kring)->na != NULL)
945 mtx_destroy(&(*kring)->q_lock);
946 nm_os_selinfo_uninit(&(*kring)->si);
947 }
948 nm_os_free(na->tx_rings);
949 na->tx_rings = na->rx_rings = na->tailroom = NULL;
950 }
951
952
953 /*
954 * Destructor for NIC ports. They also have an mbuf queue
955 * on the rings connected to the host so we need to purge
956 * them first.
957 */
958 /* call with NMG_LOCK held */
959 void
netmap_hw_krings_delete(struct netmap_adapter * na)960 netmap_hw_krings_delete(struct netmap_adapter *na)
961 {
962 u_int lim = netmap_real_rings(na, NR_RX), i;
963
964 for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
965 struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
966 nm_prdis("destroy sw mbq with len %d", mbq_len(q));
967 mbq_purge(q);
968 mbq_safe_fini(q);
969 }
970 netmap_krings_delete(na);
971 }
972
973 static void
netmap_mem_drop(struct netmap_adapter * na)974 netmap_mem_drop(struct netmap_adapter *na)
975 {
976 int last = netmap_mem_deref(na->nm_mem, na);
977 /* if the native allocator had been overrided on regif,
978 * restore it now and drop the temporary one
979 */
980 if (last && na->nm_mem_prev) {
981 netmap_mem_put(na->nm_mem);
982 na->nm_mem = na->nm_mem_prev;
983 na->nm_mem_prev = NULL;
984 }
985 }
986
987 /*
988 * Undo everything that was done in netmap_do_regif(). In particular,
989 * call nm_register(ifp,0) to stop netmap mode on the interface and
990 * revert to normal operation.
991 */
992 /* call with NMG_LOCK held */
993 static void netmap_unset_ringid(struct netmap_priv_d *);
994 static void netmap_krings_put(struct netmap_priv_d *);
995 void
netmap_do_unregif(struct netmap_priv_d * priv)996 netmap_do_unregif(struct netmap_priv_d *priv)
997 {
998 struct netmap_adapter *na = priv->np_na;
999
1000 NMG_LOCK_ASSERT();
1001 na->active_fds--;
1002 /* unset nr_pending_mode and possibly release exclusive mode */
1003 netmap_krings_put(priv);
1004
1005 #ifdef WITH_MONITOR
1006 /* XXX check whether we have to do something with monitor
1007 * when rings change nr_mode. */
1008 if (na->active_fds <= 0) {
1009 /* walk through all the rings and tell any monitor
1010 * that the port is going to exit netmap mode
1011 */
1012 netmap_monitor_stop(na);
1013 }
1014 #endif
1015
1016 if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1017 na->nm_register(na, 0);
1018 }
1019
1020 /* delete rings and buffers that are no longer needed */
1021 netmap_mem_rings_delete(na);
1022
1023 if (na->active_fds <= 0) { /* last instance */
1024 /*
1025 * (TO CHECK) We enter here
1026 * when the last reference to this file descriptor goes
1027 * away. This means we cannot have any pending poll()
1028 * or interrupt routine operating on the structure.
1029 * XXX The file may be closed in a thread while
1030 * another thread is using it.
1031 * Linux keeps the file opened until the last reference
1032 * by any outstanding ioctl/poll or mmap is gone.
1033 * FreeBSD does not track mmap()s (but we do) and
1034 * wakes up any sleeping poll(). Need to check what
1035 * happens if the close() occurs while a concurrent
1036 * syscall is running.
1037 */
1038 if (netmap_debug & NM_DEBUG_ON)
1039 nm_prinf("deleting last instance for %s", na->name);
1040
1041 if (nm_netmap_on(na)) {
1042 nm_prerr("BUG: netmap on while going to delete the krings");
1043 }
1044
1045 na->nm_krings_delete(na);
1046
1047 /* restore the default number of host tx and rx rings */
1048 if (na->na_flags & NAF_HOST_RINGS) {
1049 na->num_host_tx_rings = 1;
1050 na->num_host_rx_rings = 1;
1051 } else {
1052 na->num_host_tx_rings = 0;
1053 na->num_host_rx_rings = 0;
1054 }
1055 }
1056
1057 /* possibily decrement counter of tx_si/rx_si users */
1058 netmap_unset_ringid(priv);
1059 /* delete the nifp */
1060 netmap_mem_if_delete(na, priv->np_nifp);
1061 /* drop the allocator */
1062 netmap_mem_drop(na);
1063 /* mark the priv as unregistered */
1064 priv->np_na = NULL;
1065 priv->np_nifp = NULL;
1066 }
1067
1068 struct netmap_priv_d*
netmap_priv_new(void)1069 netmap_priv_new(void)
1070 {
1071 struct netmap_priv_d *priv;
1072
1073 priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1074 if (priv == NULL)
1075 return NULL;
1076 priv->np_refs = 1;
1077 nm_os_get_module();
1078 return priv;
1079 }
1080
1081 /*
1082 * Destructor of the netmap_priv_d, called when the fd is closed
1083 * Action: undo all the things done by NIOCREGIF,
1084 * On FreeBSD we need to track whether there are active mmap()s,
1085 * and we use np_active_mmaps for that. On linux, the field is always 0.
1086 * Return: 1 if we can free priv, 0 otherwise.
1087 *
1088 */
1089 /* call with NMG_LOCK held */
1090 void
netmap_priv_delete(struct netmap_priv_d * priv)1091 netmap_priv_delete(struct netmap_priv_d *priv)
1092 {
1093 struct netmap_adapter *na = priv->np_na;
1094
1095 /* number of active references to this fd */
1096 if (--priv->np_refs > 0) {
1097 return;
1098 }
1099 nm_os_put_module();
1100 if (na) {
1101 netmap_do_unregif(priv);
1102 }
1103 netmap_unget_na(na, priv->np_ifp);
1104 bzero(priv, sizeof(*priv)); /* for safety */
1105 nm_os_free(priv);
1106 }
1107
1108
/*
 * Wrapper around netmap_priv_delete() that takes and releases
 * the NMG_LOCK itself; 'data' is the netmap_priv_d to drop.
 */
/* call with NMG_LOCK *not* held */
void
netmap_dtor(void *data)
{
	NMG_LOCK();
	netmap_priv_delete((struct netmap_priv_d *)data);
	NMG_UNLOCK();
}
1119
1120
1121 /*
1122 * Handlers for synchronization of the rings from/to the host stack.
1123 * These are associated to a network interface and are just another
1124 * ring pair managed by userspace.
1125 *
1126 * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1127 * flags):
1128 *
1129 * - Before releasing buffers on hw RX rings, the application can mark
1130 * them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1131 * will be forwarded to the host stack, similarly to what happened if
1132 * the application moved them to the host TX ring.
1133 *
1134 * - Before releasing buffers on the host RX ring, the application can
1135 * mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1136 * they will be forwarded to the hw TX rings, saving the application
1137 * from doing the same task in user-space.
1138 *
 * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
 * flag, or globally with the netmap_fwd sysctl.
1141 *
1142 * The transfer NIC --> host is relatively easy, just encapsulate
1143 * into mbufs and we are done. The host --> NIC side is slightly
1144 * harder because there might not be room in the tx ring so it
1145 * might take a while before releasing the buffer.
1146 */
1147
1148
1149 /*
1150 * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1151 * We do not need to lock because the queue is private.
1152 * After this call the queue is empty.
1153 */
1154 static void
netmap_send_up(struct ifnet * dst,struct mbq * q)1155 netmap_send_up(struct ifnet *dst, struct mbq *q)
1156 {
1157 struct mbuf *m;
1158 struct mbuf *head = NULL, *prev = NULL;
1159
1160 /* Send packets up, outside the lock; head/prev machinery
1161 * is only useful for Windows. */
1162 while ((m = mbq_dequeue(q)) != NULL) {
1163 if (netmap_debug & NM_DEBUG_HOST)
1164 nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1165 prev = nm_os_send_up(dst, m, prev);
1166 if (head == NULL)
1167 head = prev;
1168 }
1169 if (head)
1170 nm_os_send_up(dst, NULL, head);
1171 mbq_fini(q);
1172 }
1173
1174
1175 /*
1176 * Scan the buffers from hwcur to ring->head, and put a copy of those
1177 * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1178 * Drop remaining packets in the unlikely event
1179 * of an mbuf shortage.
1180 */
1181 static void
netmap_grab_packets(struct netmap_kring * kring,struct mbq * q,int force)1182 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1183 {
1184 u_int const lim = kring->nkr_num_slots - 1;
1185 u_int const head = kring->rhead;
1186 u_int n;
1187 struct netmap_adapter *na = kring->na;
1188
1189 for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1190 struct mbuf *m;
1191 struct netmap_slot *slot = &kring->ring->slot[n];
1192
1193 if ((slot->flags & NS_FORWARD) == 0 && !force)
1194 continue;
1195 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1196 nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
1197 continue;
1198 }
1199 slot->flags &= ~NS_FORWARD; // XXX needed ?
1200 /* XXX TODO: adapt to the case of a multisegment packet */
1201 m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1202
1203 if (m == NULL)
1204 break;
1205 mbq_enqueue(q, m);
1206 }
1207 }
1208
1209 static inline int
_nm_may_forward(struct netmap_kring * kring)1210 _nm_may_forward(struct netmap_kring *kring)
1211 {
1212 return ((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1213 kring->na->na_flags & NAF_HOST_RINGS &&
1214 kring->tx == NR_RX);
1215 }
1216
1217 static inline int
nm_may_forward_up(struct netmap_kring * kring)1218 nm_may_forward_up(struct netmap_kring *kring)
1219 {
1220 return _nm_may_forward(kring) &&
1221 kring->ring_id != kring->na->num_rx_rings;
1222 }
1223
1224 static inline int
nm_may_forward_down(struct netmap_kring * kring,int sync_flags)1225 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1226 {
1227 return _nm_may_forward(kring) &&
1228 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1229 kring->ring_id == kring->na->num_rx_rings;
1230 }
1231
1232 /*
1233 * Send to the NIC rings packets marked NS_FORWARD between
1234 * kring->nr_hwcur and kring->rhead.
1235 * Called under kring->rx_queue.lock on the sw rx ring.
1236 *
1237 * It can only be called if the user opened all the TX hw rings,
1238 * see NAF_CAN_FORWARD_DOWN flag.
1239 * We can touch the TX netmap rings (slots, head and cur) since
1240 * we are in poll/ioctl system call context, and the application
1241 * is not supposed to touch the ring (using a different thread)
1242 * during the execution of the system call.
1243 */
1244 static u_int
netmap_sw_to_nic(struct netmap_adapter * na)1245 netmap_sw_to_nic(struct netmap_adapter *na)
1246 {
1247 struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1248 struct netmap_slot *rxslot = kring->ring->slot;
1249 u_int i, rxcur = kring->nr_hwcur;
1250 u_int const head = kring->rhead;
1251 u_int const src_lim = kring->nkr_num_slots - 1;
1252 u_int sent = 0;
1253
1254 /* scan rings to find space, then fill as much as possible */
1255 for (i = 0; i < na->num_tx_rings; i++) {
1256 struct netmap_kring *kdst = na->tx_rings[i];
1257 struct netmap_ring *rdst = kdst->ring;
1258 u_int const dst_lim = kdst->nkr_num_slots - 1;
1259
1260 /* XXX do we trust ring or kring->rcur,rtail ? */
1261 for (; rxcur != head && !nm_ring_empty(rdst);
1262 rxcur = nm_next(rxcur, src_lim) ) {
1263 struct netmap_slot *src, *dst, tmp;
1264 u_int dst_head = rdst->head;
1265
1266 src = &rxslot[rxcur];
1267 if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1268 continue;
1269
1270 sent++;
1271
1272 dst = &rdst->slot[dst_head];
1273
1274 tmp = *src;
1275
1276 src->buf_idx = dst->buf_idx;
1277 src->flags = NS_BUF_CHANGED;
1278
1279 dst->buf_idx = tmp.buf_idx;
1280 dst->len = tmp.len;
1281 dst->flags = NS_BUF_CHANGED;
1282
1283 rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1284 }
1285 /* if (sent) XXX txsync ? it would be just an optimization */
1286 }
1287 return sent;
1288 }
1289
1290
1291 /*
1292 * netmap_txsync_to_host() passes packets up. We are called from a
1293 * system call in user process context, and the only contention
1294 * can be among multiple user threads erroneously calling
1295 * this routine concurrently.
1296 */
1297 static int
netmap_txsync_to_host(struct netmap_kring * kring,int flags)1298 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1299 {
1300 struct netmap_adapter *na = kring->na;
1301 u_int const lim = kring->nkr_num_slots - 1;
1302 u_int const head = kring->rhead;
1303 struct mbq q;
1304
1305 /* Take packets from hwcur to head and pass them up.
1306 * Force hwcur = head since netmap_grab_packets() stops at head
1307 */
1308 mbq_init(&q);
1309 netmap_grab_packets(kring, &q, 1 /* force */);
1310 nm_prdis("have %d pkts in queue", mbq_len(&q));
1311 kring->nr_hwcur = head;
1312 kring->nr_hwtail = head + lim;
1313 if (kring->nr_hwtail > lim)
1314 kring->nr_hwtail -= lim + 1;
1315
1316 netmap_send_up(na->ifp, &q);
1317 return 0;
1318 }
1319
1320
1321 /*
1322 * rxsync backend for packets coming from the host stack.
1323 * They have been put in kring->rx_queue by netmap_transmit().
1324 * We protect access to the kring using kring->rx_queue.lock
1325 *
1326 * also moves to the nic hw rings any packet the user has marked
1327 * for transparent-mode forwarding, then sets the NR_FORWARD
1328 * flag in the kring to let the caller push them out
1329 */
1330 static int
netmap_rxsync_from_host(struct netmap_kring * kring,int flags)1331 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1332 {
1333 struct netmap_adapter *na = kring->na;
1334 struct netmap_ring *ring = kring->ring;
1335 u_int nm_i, n;
1336 u_int const lim = kring->nkr_num_slots - 1;
1337 u_int const head = kring->rhead;
1338 int ret = 0;
1339 struct mbq *q = &kring->rx_queue, fq;
1340
1341 mbq_init(&fq); /* fq holds packets to be freed */
1342
1343 mbq_lock(q);
1344
1345 /* First part: import newly received packets */
1346 n = mbq_len(q);
1347 if (n) { /* grab packets from the queue */
1348 struct mbuf *m;
1349 uint32_t stop_i;
1350
1351 nm_i = kring->nr_hwtail;
1352 stop_i = nm_prev(kring->nr_hwcur, lim);
1353 while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1354 int len = MBUF_LEN(m);
1355 struct netmap_slot *slot = &ring->slot[nm_i];
1356
1357 m_copydata(m, 0, len, NMB(na, slot));
1358 nm_prdis("nm %d len %d", nm_i, len);
1359 if (netmap_debug & NM_DEBUG_HOST)
1360 nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1361
1362 slot->len = len;
1363 slot->flags = 0;
1364 nm_i = nm_next(nm_i, lim);
1365 mbq_enqueue(&fq, m);
1366 }
1367 kring->nr_hwtail = nm_i;
1368 }
1369
1370 /*
1371 * Second part: skip past packets that userspace has released.
1372 */
1373 nm_i = kring->nr_hwcur;
1374 if (nm_i != head) { /* something was released */
1375 if (nm_may_forward_down(kring, flags)) {
1376 ret = netmap_sw_to_nic(na);
1377 if (ret > 0) {
1378 kring->nr_kflags |= NR_FORWARD;
1379 ret = 0;
1380 }
1381 }
1382 kring->nr_hwcur = head;
1383 }
1384
1385 mbq_unlock(q);
1386
1387 mbq_purge(&fq);
1388 mbq_fini(&fq);
1389
1390 return ret;
1391 }
1392
1393
1394 /* Get a netmap adapter for the port.
1395 *
1396 * If it is possible to satisfy the request, return 0
1397 * with *na containing the netmap adapter found.
1398 * Otherwise return an error code, with *na containing NULL.
1399 *
1400 * When the port is attached to a bridge, we always return
1401 * EBUSY.
1402 * Otherwise, if the port is already bound to a file descriptor,
1403 * then we unconditionally return the existing adapter into *na.
1404 * In all the other cases, we return (into *na) either native,
1405 * generic or NULL, according to the following table:
1406 *
1407 * native_support
1408 * active_fds dev.netmap.admode YES NO
1409 * -------------------------------------------------------
1410 * >0 * NA(ifp) NA(ifp)
1411 *
1412 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC
1413 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL
1414 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC
1415 *
1416 */
1417 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1418 int
netmap_get_hw_na(struct ifnet * ifp,struct netmap_mem_d * nmd,struct netmap_adapter ** na)1419 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1420 {
1421 /* generic support */
1422 int i = netmap_admode; /* Take a snapshot. */
1423 struct netmap_adapter *prev_na;
1424 int error = 0;
1425
1426 *na = NULL; /* default */
1427
1428 /* reset in case of invalid value */
1429 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1430 i = netmap_admode = NETMAP_ADMODE_BEST;
1431
1432 if (NM_NA_VALID(ifp)) {
1433 prev_na = NA(ifp);
1434 /* If an adapter already exists, return it if
1435 * there are active file descriptors or if
1436 * netmap is not forced to use generic
1437 * adapters.
1438 */
1439 if (NETMAP_OWNED_BY_ANY(prev_na)
1440 || i != NETMAP_ADMODE_GENERIC
1441 || prev_na->na_flags & NAF_FORCE_NATIVE
1442 #ifdef WITH_PIPES
1443 /* ugly, but we cannot allow an adapter switch
1444 * if some pipe is referring to this one
1445 */
1446 || prev_na->na_next_pipe > 0
1447 #endif
1448 ) {
1449 *na = prev_na;
1450 goto assign_mem;
1451 }
1452 }
1453
1454 /* If there isn't native support and netmap is not allowed
1455 * to use generic adapters, we cannot satisfy the request.
1456 */
1457 if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1458 return EOPNOTSUPP;
1459
1460 /* Otherwise, create a generic adapter and return it,
1461 * saving the previously used netmap adapter, if any.
1462 *
1463 * Note that here 'prev_na', if not NULL, MUST be a
1464 * native adapter, and CANNOT be a generic one. This is
1465 * true because generic adapters are created on demand, and
1466 * destroyed when not used anymore. Therefore, if the adapter
1467 * currently attached to an interface 'ifp' is generic, it
1468 * must be that
1469 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1470 * Consequently, if NA(ifp) is generic, we will enter one of
1471 * the branches above. This ensures that we never override
1472 * a generic adapter with another generic adapter.
1473 */
1474 error = generic_netmap_attach(ifp);
1475 if (error)
1476 return error;
1477
1478 *na = NA(ifp);
1479
1480 assign_mem:
1481 if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1482 (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1483 (*na)->nm_mem_prev = (*na)->nm_mem;
1484 (*na)->nm_mem = netmap_mem_get(nmd);
1485 }
1486
1487 return 0;
1488 }
1489
1490 /*
1491 * MUST BE CALLED UNDER NMG_LOCK()
1492 *
1493 * Get a refcounted reference to a netmap adapter attached
1494 * to the interface specified by req.
1495 * This is always called in the execution of an ioctl().
1496 *
1497 * Return ENXIO if the interface specified by the request does
1498 * not exist, ENOTSUP if netmap is not supported by the interface,
1499 * EBUSY if the interface is already attached to a bridge,
1500 * EINVAL if parameters are invalid, ENOMEM if needed resources
1501 * could not be allocated.
1502 * If successful, hold a reference to the netmap adapter.
1503 *
1504 * If the interface specified by req is a system one, also keep
1505 * a reference to it and return a valid *ifp.
1506 */
1507 int
netmap_get_na(struct nmreq_header * hdr,struct netmap_adapter ** na,struct ifnet ** ifp,struct netmap_mem_d * nmd,int create)1508 netmap_get_na(struct nmreq_header *hdr,
1509 struct netmap_adapter **na, struct ifnet **ifp,
1510 struct netmap_mem_d *nmd, int create)
1511 {
1512 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1513 int error = 0;
1514 struct netmap_adapter *ret = NULL;
1515 int nmd_ref = 0;
1516
1517 *na = NULL; /* default return value */
1518 *ifp = NULL;
1519
1520 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1521 return EINVAL;
1522 }
1523
1524 if (req->nr_mode == NR_REG_PIPE_MASTER ||
1525 req->nr_mode == NR_REG_PIPE_SLAVE) {
1526 /* Do not accept deprecated pipe modes. */
1527 nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1528 return EINVAL;
1529 }
1530
1531 NMG_LOCK_ASSERT();
1532
1533 /* if the request contain a memid, try to find the
1534 * corresponding memory region
1535 */
1536 if (nmd == NULL && req->nr_mem_id) {
1537 nmd = netmap_mem_find(req->nr_mem_id);
1538 if (nmd == NULL)
1539 return EINVAL;
1540 /* keep the rereference */
1541 nmd_ref = 1;
1542 }
1543
1544 /* We cascade through all possible types of netmap adapter.
1545 * All netmap_get_*_na() functions return an error and an na,
1546 * with the following combinations:
1547 *
1548 * error na
1549 * 0 NULL type doesn't match
1550 * !0 NULL type matches, but na creation/lookup failed
1551 * 0 !NULL type matches and na created/found
1552 * !0 !NULL impossible
1553 */
1554 error = netmap_get_null_na(hdr, na, nmd, create);
1555 if (error || *na != NULL)
1556 goto out;
1557
1558 /* try to see if this is a monitor port */
1559 error = netmap_get_monitor_na(hdr, na, nmd, create);
1560 if (error || *na != NULL)
1561 goto out;
1562
1563 /* try to see if this is a pipe port */
1564 error = netmap_get_pipe_na(hdr, na, nmd, create);
1565 if (error || *na != NULL)
1566 goto out;
1567
1568 /* try to see if this is a bridge port */
1569 error = netmap_get_vale_na(hdr, na, nmd, create);
1570 if (error)
1571 goto out;
1572
1573 if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1574 goto out;
1575
1576 /*
1577 * This must be a hardware na, lookup the name in the system.
1578 * Note that by hardware we actually mean "it shows up in ifconfig".
1579 * This may still be a tap, a veth/epair, or even a
1580 * persistent VALE port.
1581 */
1582 *ifp = ifunit_ref(hdr->nr_name);
1583 if (*ifp == NULL) {
1584 error = ENXIO;
1585 goto out;
1586 }
1587
1588 error = netmap_get_hw_na(*ifp, nmd, &ret);
1589 if (error)
1590 goto out;
1591
1592 *na = ret;
1593 netmap_adapter_get(ret);
1594
1595 /*
1596 * if the adapter supports the host rings and it is not alread open,
1597 * try to set the number of host rings as requested by the user
1598 */
1599 if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
1600 if (req->nr_host_tx_rings)
1601 (*na)->num_host_tx_rings = req->nr_host_tx_rings;
1602 if (req->nr_host_rx_rings)
1603 (*na)->num_host_rx_rings = req->nr_host_rx_rings;
1604 }
1605 nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
1606 (*na)->num_host_rx_rings);
1607
1608 out:
1609 if (error) {
1610 if (ret)
1611 netmap_adapter_put(ret);
1612 if (*ifp) {
1613 if_rele(*ifp);
1614 *ifp = NULL;
1615 }
1616 }
1617 if (nmd_ref)
1618 netmap_mem_put(nmd);
1619
1620 return error;
1621 }
1622
1623 /* undo netmap_get_na() */
1624 void
netmap_unget_na(struct netmap_adapter * na,struct ifnet * ifp)1625 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1626 {
1627 if (ifp)
1628 if_rele(ifp);
1629 if (na)
1630 netmap_adapter_put(na);
1631 }
1632
1633
/*
 * Helper for the sync prologues below: if condition 't' holds, log the
 * failed condition (rate-limited) together with the ring state and
 * RETURN kring->nkr_num_slots from the enclosing function.
 * It expects 'kring', 'ring', 'head' and 'cur' to be in scope
 * at the point of use.
 */
#define NM_FAIL_ON(t) do {						\
	if (unlikely(t)) {						\
		nm_prlim(5, "%s: fail '" #t "' "				\
			"h %d c %d t %d "				\
			"rh %d rc %d rt %d "				\
			"hc %d ht %d",					\
			kring->name,					\
			head, cur, ring->tail,				\
			kring->rhead, kring->rcur, kring->rtail,	\
			kring->nr_hwcur, kring->nr_hwtail);		\
		return kring->nkr_num_slots;				\
	}								\
} while (0)
1647
/*
 * validate parameters on entry for *_txsync()
 * Returns the head value read from the ring if ok, or something
 * >= kring->nkr_num_slots in case of error (each NM_FAIL_ON() below
 * returns kring->nkr_num_slots).
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 *
 * When no check fails, kring->rhead and kring->rcur are updated with the
 * values read from the ring, and ring->tail is restored from
 * kring->rtail if it was modified.
 */
u_int
nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
	/*
	 * user sanity checks. We only use head,
	 * A, B, ... are possible positions for head:
	 *
	 *  0    A  rhead   B  rtail   C  n-1
	 *  0    D  rtail   E  rhead   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
		/* and also head <= cur <= rtail */
		NM_FAIL_ON(cur < head || cur > kring->rtail);
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
		/*
		 * NOTE(review): in this branch a head (or cur) >= n is not
		 * rejected by the checks; an out-of-range head is returned
		 * as is, so presumably the caller's '>= nkr_num_slots' test
		 * is what catches it -- confirm.
		 */

		/* two cases now: head <= rtail or head >= rhead */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			NM_FAIL_ON(cur < head || cur > kring->rtail);
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			NM_FAIL_ON(cur > kring->rtail && cur < head);
		}
	}
	/* tail belongs to the kernel: undo any change made to it */
	if (ring->tail != kring->rtail) {
		nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	/* record the accepted values for the next round */
	kring->rhead = head;
	kring->rcur = cur;
	return head;
}
1712
1713
/*
 * validate parameters on entry for *_rxsync()
 * Returns ring->head if ok, kring->nkr_num_slots on error.
 *
 * For a valid configuration,
 * hwcur <= head <= cur <= tail <= hwtail
 *
 * We only consider head and cur.
 * hwcur and hwtail are reliable.
 *
 * Note that kring->rhead and kring->rcur are overwritten with the values
 * read from the ring before the checks run, so they are updated
 * even when an error is returned.
 */
u_int
nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
	uint32_t const n = kring->nkr_num_slots;
	uint32_t head, cur;

	nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is hwcur;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	cur = kring->rcur = ring->cur; /* read only once */
	head = kring->rhead = ring->head; /* read only once */
#if 1 /* kernel sanity checks */
	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
	/* user sanity checks */
	if (kring->nr_hwtail >= kring->nr_hwcur) {
		/* no wrap: want hwcur <= rhead <= hwtail */
		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
		/* and also rhead <= rcur <= hwtail */
		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
	} else {
		/* wrapped (hwtail < hwcur): we need rhead outside hwtail..hwcur */
		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
		/* two cases now: head <= hwtail or head >= hwcur */
		if (head <= kring->nr_hwtail) {
			/* want head <= cur <= hwtail */
			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
		} else {
			/* cur must be outside hwtail..head */
			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
		}
	}
	/* tail belongs to the kernel: undo any change made to it */
	if (ring->tail != kring->rtail) {
		nm_prlim(5, "%s tail overwritten was %d need %d",
			kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	return head;
}
1773
1774
1775 /*
1776 * Error routine called when txsync/rxsync detects an error.
1777 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1778 * Return 1 on reinit.
1779 *
1780 * This routine is only called by the upper half of the kernel.
1781 * It only reads hwcur (which is changed only by the upper half, too)
1782 * and hwtail (which may be changed by the lower half, but only on
1783 * a tx ring and only to increase it, so any error will be recovered
1784 * on the next call). For the above, we don't strictly need to call
1785 * it under lock.
1786 */
1787 int
netmap_ring_reinit(struct netmap_kring * kring)1788 netmap_ring_reinit(struct netmap_kring *kring)
1789 {
1790 struct netmap_ring *ring = kring->ring;
1791 u_int i, lim = kring->nkr_num_slots - 1;
1792 int errors = 0;
1793
1794 // XXX KASSERT nm_kr_tryget
1795 nm_prlim(10, "called for %s", kring->name);
1796 // XXX probably wrong to trust userspace
1797 kring->rhead = ring->head;
1798 kring->rcur = ring->cur;
1799 kring->rtail = ring->tail;
1800
1801 if (ring->cur > lim)
1802 errors++;
1803 if (ring->head > lim)
1804 errors++;
1805 if (ring->tail > lim)
1806 errors++;
1807 for (i = 0; i <= lim; i++) {
1808 u_int idx = ring->slot[i].buf_idx;
1809 u_int len = ring->slot[i].len;
1810 if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1811 nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1812 ring->slot[i].buf_idx = 0;
1813 ring->slot[i].len = 0;
1814 } else if (len > NETMAP_BUF_SIZE(kring->na)) {
1815 ring->slot[i].len = 0;
1816 nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
1817 }
1818 }
1819 if (errors) {
1820 nm_prlim(10, "total %d errors", errors);
1821 nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
1822 kring->name,
1823 ring->cur, kring->nr_hwcur,
1824 ring->tail, kring->nr_hwtail);
1825 ring->head = kring->rhead = kring->nr_hwcur;
1826 ring->cur = kring->rcur = kring->nr_hwcur;
1827 ring->tail = kring->rtail = kring->nr_hwtail;
1828 }
1829 return (errors ? 1 : 0);
1830 }
1831
1832 /* interpret the ringid and flags fields of an nmreq, by translating them
1833 * into a pair of intervals of ring indices:
1834 *
1835 * [priv->np_txqfirst, priv->np_txqlast) and
1836 * [priv->np_rxqfirst, priv->np_rxqlast)
1837 *
1838 */
1839 int
netmap_interp_ringid(struct netmap_priv_d * priv,struct nmreq_header * hdr)1840 netmap_interp_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1841 {
1842 struct netmap_adapter *na = priv->np_na;
1843 struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1844 int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1845 enum txrx t;
1846 u_int j;
1847 u_int nr_flags = reg->nr_flags, nr_mode = reg->nr_mode,
1848 nr_ringid = reg->nr_ringid;
1849
1850 for_rx_tx(t) {
1851 if (nr_flags & excluded_direction[t]) {
1852 priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1853 continue;
1854 }
1855 switch (nr_mode) {
1856 case NR_REG_ALL_NIC:
1857 case NR_REG_NULL:
1858 priv->np_qfirst[t] = 0;
1859 priv->np_qlast[t] = nma_get_nrings(na, t);
1860 nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1861 priv->np_qfirst[t], priv->np_qlast[t]);
1862 break;
1863 case NR_REG_SW:
1864 case NR_REG_NIC_SW:
1865 if (!(na->na_flags & NAF_HOST_RINGS)) {
1866 nm_prerr("host rings not supported");
1867 return EINVAL;
1868 }
1869 priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1870 nma_get_nrings(na, t) : 0);
1871 priv->np_qlast[t] = netmap_all_rings(na, t);
1872 nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1873 nm_txrx2str(t),
1874 priv->np_qfirst[t], priv->np_qlast[t]);
1875 break;
1876 case NR_REG_ONE_NIC:
1877 if (nr_ringid >= na->num_tx_rings &&
1878 nr_ringid >= na->num_rx_rings) {
1879 nm_prerr("invalid ring id %d", nr_ringid);
1880 return EINVAL;
1881 }
1882 /* if not enough rings, use the first one */
1883 j = nr_ringid;
1884 if (j >= nma_get_nrings(na, t))
1885 j = 0;
1886 priv->np_qfirst[t] = j;
1887 priv->np_qlast[t] = j + 1;
1888 nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
1889 priv->np_qfirst[t], priv->np_qlast[t]);
1890 break;
1891 case NR_REG_ONE_SW:
1892 if (!(na->na_flags & NAF_HOST_RINGS)) {
1893 nm_prerr("host rings not supported");
1894 return EINVAL;
1895 }
1896 if (nr_ringid >= na->num_host_tx_rings &&
1897 nr_ringid >= na->num_host_rx_rings) {
1898 nm_prerr("invalid ring id %d", nr_ringid);
1899 return EINVAL;
1900 }
1901 /* if not enough rings, use the first one */
1902 j = nr_ringid;
1903 if (j >= nma_get_host_nrings(na, t))
1904 j = 0;
1905 priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
1906 priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
1907 nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
1908 priv->np_qfirst[t], priv->np_qlast[t]);
1909 break;
1910 default:
1911 nm_prerr("invalid regif type %d", nr_mode);
1912 return EINVAL;
1913 }
1914 }
1915 priv->np_flags = nr_flags;
1916
1917 /* Allow transparent forwarding mode in the host --> nic
1918 * direction only if all the TX hw rings have been opened. */
1919 if (priv->np_qfirst[NR_TX] == 0 &&
1920 priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1921 priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1922 }
1923
1924 if (netmap_verbose) {
1925 nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1926 na->name,
1927 priv->np_qfirst[NR_TX],
1928 priv->np_qlast[NR_TX],
1929 priv->np_qfirst[NR_RX],
1930 priv->np_qlast[NR_RX],
1931 nr_ringid);
1932 }
1933 return 0;
1934 }
1935
1936
1937 /*
1938 * Set the ring ID. For devices with a single queue, a request
1939 * for all rings is the same as a single ring.
1940 */
1941 static int
netmap_set_ringid(struct netmap_priv_d * priv,struct nmreq_header * hdr)1942 netmap_set_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1943 {
1944 struct netmap_adapter *na = priv->np_na;
1945 struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1946 int error;
1947 enum txrx t;
1948
1949 error = netmap_interp_ringid(priv, hdr);
1950 if (error) {
1951 return error;
1952 }
1953
1954 priv->np_txpoll = (reg->nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1955
1956 /* optimization: count the users registered for more than
1957 * one ring, which are the ones sleeping on the global queue.
1958 * The default netmap_notify() callback will then
1959 * avoid signaling the global queue if nobody is using it
1960 */
1961 for_rx_tx(t) {
1962 if (nm_si_user(priv, t))
1963 na->si_users[t]++;
1964 }
1965 return 0;
1966 }
1967
1968 static void
netmap_unset_ringid(struct netmap_priv_d * priv)1969 netmap_unset_ringid(struct netmap_priv_d *priv)
1970 {
1971 struct netmap_adapter *na = priv->np_na;
1972 enum txrx t;
1973
1974 for_rx_tx(t) {
1975 if (nm_si_user(priv, t))
1976 na->si_users[t]--;
1977 priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1978 }
1979 priv->np_flags = 0;
1980 priv->np_txpoll = 0;
1981 priv->np_kloop_state = 0;
1982 }
1983
/*
 * Helpers to walk the krings selected by a netmap_priv_d.
 *
 * within_sel(p_, t_, i_)	true if index i_ is below the upper bound of
 *				the interval selected in direction t_.
 * nonempty_sel(p_, t_)		true if the interval selected in direction
 *				t_ contains at least one ring.
 * foreach_selected_ring(p_, t_, i_, kring_)
 *	loop header visiting every selected ring.  The walk starts from
 *	the NR_RX interval when it is non-empty, otherwise from NR_TX;
 *	when an interval is exhausted t_ is incremented and the walk
 *	continues with the next direction.  On each iteration t_ holds the
 *	direction, i_ the ring index and kring_ the element i_ of
 *	NMR(p_->np_na, t_).  The loop also stops if that element is NULL.
 *
 * t_, i_ and kring_ must be side-effect-free lvalues: they are assigned
 * and evaluated several times.  Only the macro parameters are referenced,
 * so the caller may use any variable names.
 */
#define within_sel(p_, t_, i_)						\
	((i_) < (p_)->np_qlast[(t_)])
#define nonempty_sel(p_, t_)						\
	(within_sel((p_), (t_), (p_)->np_qfirst[(t_)]))
#define foreach_selected_ring(p_, t_, i_, kring_)			\
	for ((t_) = nonempty_sel((p_), NR_RX) ? NR_RX : NR_TX,		\
	     (i_) = (p_)->np_qfirst[(t_)];				\
	     ((t_) == NR_RX ||						\
	      ((t_) == NR_TX && within_sel((p_), (t_), (i_)))) &&	\
	      ((kring_) = NMR((p_)->np_na, (t_))[(i_)]);		\
	     (i_) = within_sel((p_), (t_), (i_) + 1) ? (i_) + 1 :	\
		(++(t_) < NR_TXRX ? (p_)->np_qfirst[(t_)] : (i_)))
1996
1997
1998 /* Set the nr_pending_mode for the requested rings.
1999 * If requested, also try to get exclusive access to the rings, provided
2000 * the rings we want to bind are not exclusively owned by a previous bind.
2001 */
2002 static int
netmap_krings_get(struct netmap_priv_d * priv)2003 netmap_krings_get(struct netmap_priv_d *priv)
2004 {
2005 struct netmap_adapter *na = priv->np_na;
2006 u_int i;
2007 struct netmap_kring *kring;
2008 int excl = (priv->np_flags & NR_EXCLUSIVE);
2009 enum txrx t;
2010
2011 if (netmap_debug & NM_DEBUG_ON)
2012 nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
2013 na->name,
2014 priv->np_qfirst[NR_TX],
2015 priv->np_qlast[NR_TX],
2016 priv->np_qfirst[NR_RX],
2017 priv->np_qlast[NR_RX]);
2018
2019 /* first round: check that all the requested rings
2020 * are neither alread exclusively owned, nor we
2021 * want exclusive ownership when they are already in use
2022 */
2023 foreach_selected_ring(priv, t, i, kring) {
2024 if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
2025 (kring->users && excl))
2026 {
2027 nm_prdis("ring %s busy", kring->name);
2028 return EBUSY;
2029 }
2030 }
2031
2032 /* second round: increment usage count (possibly marking them
2033 * as exclusive) and set the nr_pending_mode
2034 */
2035 foreach_selected_ring(priv, t, i, kring) {
2036 kring->users++;
2037 if (excl)
2038 kring->nr_kflags |= NKR_EXCLUSIVE;
2039 kring->nr_pending_mode = NKR_NETMAP_ON;
2040 }
2041
2042 return 0;
2043
2044 }
2045
2046 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
2047 * if was asked on regif, and unset the nr_pending_mode if we are the
2048 * last users of the involved rings. */
2049 static void
netmap_krings_put(struct netmap_priv_d * priv)2050 netmap_krings_put(struct netmap_priv_d *priv)
2051 {
2052 u_int i;
2053 struct netmap_kring *kring;
2054 int excl = (priv->np_flags & NR_EXCLUSIVE);
2055 enum txrx t;
2056
2057 nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
2058 na->name,
2059 priv->np_qfirst[NR_TX],
2060 priv->np_qlast[NR_TX],
2061 priv->np_qfirst[NR_RX],
2062 priv->np_qlast[MR_RX]);
2063
2064 foreach_selected_ring(priv, t, i, kring) {
2065 if (excl)
2066 kring->nr_kflags &= ~NKR_EXCLUSIVE;
2067 kring->users--;
2068 if (kring->users == 0)
2069 kring->nr_pending_mode = NKR_NETMAP_OFF;
2070 }
2071 }
2072
2073 static int
nm_priv_rx_enabled(struct netmap_priv_d * priv)2074 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2075 {
2076 return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2077 }
2078
2079 /* Validate the CSB entries for both directions (atok and ktoa).
2080 * To be called under NMG_LOCK(). */
2081 static int
netmap_csb_validate(struct netmap_priv_d * priv,struct nmreq_opt_csb * csbo)2082 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2083 {
2084 struct nm_csb_atok *csb_atok_base =
2085 (struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2086 struct nm_csb_ktoa *csb_ktoa_base =
2087 (struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2088 enum txrx t;
2089 int num_rings[NR_TXRX], tot_rings;
2090 size_t entry_size[2];
2091 void *csb_start[2];
2092 int i;
2093
2094 if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2095 nm_prerr("Cannot update CSB while kloop is running");
2096 return EBUSY;
2097 }
2098
2099 tot_rings = 0;
2100 for_rx_tx(t) {
2101 num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2102 tot_rings += num_rings[t];
2103 }
2104 if (tot_rings <= 0)
2105 return 0;
2106
2107 if (!(priv->np_flags & NR_EXCLUSIVE)) {
2108 nm_prerr("CSB mode requires NR_EXCLUSIVE");
2109 return EINVAL;
2110 }
2111
2112 entry_size[0] = sizeof(*csb_atok_base);
2113 entry_size[1] = sizeof(*csb_ktoa_base);
2114 csb_start[0] = (void *)csb_atok_base;
2115 csb_start[1] = (void *)csb_ktoa_base;
2116
2117 for (i = 0; i < 2; i++) {
2118 /* On Linux we could use access_ok() to simplify
2119 * the validation. However, the advantage of
2120 * this approach is that it works also on
2121 * FreeBSD. */
2122 size_t csb_size = tot_rings * entry_size[i];
2123 void *tmp;
2124 int err;
2125
2126 if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2127 nm_prerr("Unaligned CSB address");
2128 return EINVAL;
2129 }
2130
2131 tmp = nm_os_malloc(csb_size);
2132 if (!tmp)
2133 return ENOMEM;
2134 if (i == 0) {
2135 /* Application --> kernel direction. */
2136 err = copyin(csb_start[i], tmp, csb_size);
2137 } else {
2138 /* Kernel --> application direction. */
2139 memset(tmp, 0, csb_size);
2140 err = copyout(tmp, csb_start[i], csb_size);
2141 }
2142 nm_os_free(tmp);
2143 if (err) {
2144 nm_prerr("Invalid CSB address");
2145 return err;
2146 }
2147 }
2148
2149 priv->np_csb_atok_base = csb_atok_base;
2150 priv->np_csb_ktoa_base = csb_ktoa_base;
2151
2152 /* Initialize the CSB. */
2153 for_rx_tx(t) {
2154 for (i = 0; i < num_rings[t]; i++) {
2155 struct netmap_kring *kring =
2156 NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2157 struct nm_csb_atok *csb_atok = csb_atok_base + i;
2158 struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2159
2160 if (t == NR_RX) {
2161 csb_atok += num_rings[NR_TX];
2162 csb_ktoa += num_rings[NR_TX];
2163 }
2164
2165 CSB_WRITE(csb_atok, head, kring->rhead);
2166 CSB_WRITE(csb_atok, cur, kring->rcur);
2167 CSB_WRITE(csb_atok, appl_need_kick, 1);
2168 CSB_WRITE(csb_atok, sync_flags, 1);
2169 CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2170 CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2171 CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2172
2173 nm_prinf("csb_init for kring %s: head %u, cur %u, "
2174 "hwcur %u, hwtail %u", kring->name,
2175 kring->rhead, kring->rcur, kring->nr_hwcur,
2176 kring->nr_hwtail);
2177 }
2178 }
2179
2180 return 0;
2181 }
2182
2183 /* Ensure that the netmap adapter can support the given MTU.
2184 * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2185 */
2186 int
netmap_buf_size_validate(const struct netmap_adapter * na,unsigned mtu)2187 netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2188 unsigned nbs = NETMAP_BUF_SIZE(na);
2189
2190 if (mtu <= na->rx_buf_maxsize) {
2191 /* The MTU fits a single NIC slot. We only
2192 * Need to check that netmap buffers are
2193 * large enough to hold an MTU. NS_MOREFRAG
2194 * cannot be used in this case. */
2195 if (nbs < mtu) {
2196 nm_prerr("error: netmap buf size (%u) "
2197 "< device MTU (%u)", nbs, mtu);
2198 return EINVAL;
2199 }
2200 } else {
2201 /* More NIC slots may be needed to receive
2202 * or transmit a single packet. Check that
2203 * the adapter supports NS_MOREFRAG and that
2204 * netmap buffers are large enough to hold
2205 * the maximum per-slot size. */
2206 if (!(na->na_flags & NAF_MOREFRAG)) {
2207 nm_prerr("error: large MTU (%d) needed "
2208 "but %s does not support "
2209 "NS_MOREFRAG", mtu,
2210 na->ifp->if_xname);
2211 return EINVAL;
2212 } else if (nbs < na->rx_buf_maxsize) {
2213 nm_prerr("error: using NS_MOREFRAG on "
2214 "%s requires netmap buf size "
2215 ">= %u", na->ifp->if_xname,
2216 na->rx_buf_maxsize);
2217 return EINVAL;
2218 } else {
2219 nm_prinf("info: netmap application on "
2220 "%s needs to support "
2221 "NS_MOREFRAG "
2222 "(MTU=%u,netmap_buf_size=%u)",
2223 na->ifp->if_xname, mtu, nbs);
2224 }
2225 }
2226 return 0;
2227 }
2228
2229
2230 /*
2231 * possibly move the interface to netmap-mode.
2232 * On success it returns 0 and stores the new netmap_if in priv->np_nifp, otherwise it returns an errno value.
2233 * This must be called with NMG_LOCK held.
2234 *
2235 * The following na callbacks are called in the process:
2236 *
2237 * na->nm_config() [by netmap_update_config]
2238 * (get current number and size of rings)
2239 *
2240 * We have a generic one for linux (netmap_linux_config).
2241 * The bwrap has to override this, since it has to forward
2242 * the request to the wrapped adapter (netmap_bwrap_config).
2243 *
2244 *
2245 * na->nm_krings_create()
2246 * (create and init the krings array)
2247 *
2248 * One of the following:
2249 *
2250 * * netmap_hw_krings_create, (hw ports)
2251 * creates the standard layout for the krings
2252 * and adds the mbq (used for the host rings).
2253 *
2254 * * netmap_vp_krings_create (VALE ports)
2255 * add leases and scratchpads
2256 *
2257 * * netmap_pipe_krings_create (pipes)
2258 * create the krings and rings of both ends and
2259 * cross-link them
2260 *
2261 * * netmap_monitor_krings_create (monitors)
2262 * avoid allocating the mbq
2263 *
2264 * * netmap_bwrap_krings_create (bwraps)
2265 * create both the bwrap krings array,
2266 * the krings array of the wrapped adapter, and
2267 * (if needed) the fake array for the host adapter
2268 *
2269 * na->nm_register(, 1)
2270 * (put the adapter in netmap mode)
2271 *
2272 * This may be one of the following:
2273 *
2274 * * netmap_hw_reg (hw ports)
2275 * checks that the ifp is still there, then calls
2276 * the hardware specific callback;
2277 *
2278 * * netmap_vp_reg (VALE ports)
2279 * If the port is connected to a bridge,
2280 * set the NAF_NETMAP_ON flag under the
2281 * bridge write lock.
2282 *
2283 * * netmap_pipe_reg (pipes)
2284 * inform the other pipe end that it is no
2285 * longer responsible for the lifetime of this
2286 * pipe end
2287 *
2288 * * netmap_monitor_reg (monitors)
2289 * intercept the sync callbacks of the monitored
2290 * rings
2291 *
2292 * * netmap_bwrap_reg (bwraps)
2293 * cross-link the bwrap and hwna rings,
2294 * forward the request to the hwna, override
2295 * the hwna notify callback (to get the frames
2296 * coming from outside go through the bridge).
2297 *
2298 *
2299 */
2300 int
netmap_do_regif(struct netmap_priv_d * priv,struct netmap_adapter * na,struct nmreq_header * hdr)2301 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2302 struct nmreq_header *hdr)
2303 {
2304 struct netmap_if *nifp = NULL;
2305 int error;
2306
2307 NMG_LOCK_ASSERT();
2308 priv->np_na = na; /* store the reference */
2309 error = netmap_mem_finalize(na->nm_mem, na);
2310 if (error)
2311 goto err;
2312
2313 if (na->active_fds == 0) {
2314
2315 /* cache the allocator info in the na */
2316 error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2317 if (error)
2318 goto err_drop_mem;
2319 nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2320 na->na_lut.objsize);
2321
2322 /* ring configuration may have changed, fetch from the card */
2323 netmap_update_config(na);
2324 }
2325
2326 /* compute the range of tx and rx rings to monitor */
2327 error = netmap_set_ringid(priv, hdr);
2328 if (error)
2329 goto err_put_lut;
2330
2331 if (na->active_fds == 0) {
2332 /*
2333 * If this is the first registration of the adapter,
2334 * perform sanity checks and create the in-kernel view
2335 * of the netmap rings (the netmap krings).
2336 */
2337 if (na->ifp && nm_priv_rx_enabled(priv)) {
2338 /* This netmap adapter is attached to an ifnet. */
2339 unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2340
2341 nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2342 na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2343
2344 if (na->rx_buf_maxsize == 0) {
2345 nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2346 error = EIO;
2347 goto err_drop_mem;
2348 }
2349
2350 error = netmap_buf_size_validate(na, mtu);
2351 if (error)
2352 goto err_drop_mem;
2353 }
2354
2355 /*
2356 * Depending on the adapter, this may also create
2357 * the netmap rings themselves
2358 */
2359 error = na->nm_krings_create(na);
2360 if (error)
2361 goto err_put_lut;
2362
2363 }
2364
2365 /* now the krings must exist and we can check whether some
2366 * previous bind has exclusive ownership on them, and set
2367 * nr_pending_mode
2368 */
2369 error = netmap_krings_get(priv);
2370 if (error)
2371 goto err_del_krings;
2372
2373 /* create all needed missing netmap rings */
2374 error = netmap_mem_rings_create(na);
2375 if (error)
2376 goto err_rel_excl;
2377
2378 /* in all cases, create a new netmap if */
2379 nifp = netmap_mem_if_new(na, priv);
2380 if (nifp == NULL) {
2381 error = ENOMEM;
2382 goto err_rel_excl;
2383 }
2384
2385 if (nm_kring_pending(priv)) {
2386 /* Some kring is switching mode, tell the adapter to
2387 * react on this. */
2388 error = na->nm_register(na, 1);
2389 if (error)
2390 goto err_del_if;
2391 }
2392
2393 /* Commit the reference. */
2394 na->active_fds++;
2395
2396 /*
2397 * advertise that the interface is ready by setting np_nifp.
2398 * The barrier is needed because readers (poll, *SYNC and mmap)
2399 * check for priv->np_nifp != NULL without locking
2400 */
2401 mb(); /* make sure previous writes are visible to all CPUs */
2402 priv->np_nifp = nifp;
2403
2404 return 0;
2405
2406 err_del_if:
2407 netmap_mem_if_delete(na, nifp);
2408 err_rel_excl:
2409 netmap_krings_put(priv);
2410 netmap_mem_rings_delete(na);
2411 err_del_krings:
2412 if (na->active_fds == 0)
2413 na->nm_krings_delete(na);
2414 err_put_lut:
2415 if (na->active_fds == 0)
2416 memset(&na->na_lut, 0, sizeof(na->na_lut));
2417 err_drop_mem:
2418 netmap_mem_drop(na);
2419 err:
2420 priv->np_na = NULL;
2421 return error;
2422 }
2423
2424
2425 /*
2426 * update kring and ring at the end of rxsync/txsync.
2427 */
2428 static inline void
nm_sync_finalize(struct netmap_kring * kring)2429 nm_sync_finalize(struct netmap_kring *kring)
2430 {
2431 /*
2432 * Update ring tail to what the kernel knows
2433 * After txsync: head/rhead/hwcur might be behind cur/rcur
2434 * if no carrier.
2435 */
2436 kring->ring->tail = kring->rtail = kring->nr_hwtail;
2437
2438 nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2439 kring->name, kring->nr_hwcur, kring->nr_hwtail,
2440 kring->rhead, kring->rcur, kring->rtail);
2441 }
2442
2443 /* set ring timestamp */
2444 static inline void
ring_timestamp_set(struct netmap_ring * ring)2445 ring_timestamp_set(struct netmap_ring *ring)
2446 {
2447 if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2448 microtime(&ring->ts);
2449 }
2450 }
2451
/* Forward declarations of the request helpers used by netmap_ioctl(). */
static int nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user);
static int nmreq_copyout(struct nmreq_header *hdr, int error);
static int nmreq_checkoptions(struct nmreq_header *hdr);
2455
2456 /*
2457 * ioctl(2) support for the "netmap" device.
2458 *
2459 * Following a list of accepted commands:
2460 * - NIOCCTRL device control API
2461 * - NIOCTXSYNC sync TX rings
2462 * - NIOCRXSYNC sync RX rings
2463 * - SIOCGIFADDR just for convenience
2464 * - NIOCGINFO deprecated (legacy API)
2465 * - NIOCREGIF deprecated (legacy API)
2466 *
2467 * Return 0 on success, errno otherwise.
2468 */
2469 int
netmap_ioctl(struct netmap_priv_d * priv,u_long cmd,caddr_t data,struct thread * td,int nr_body_is_user)2470 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2471 struct thread *td, int nr_body_is_user)
2472 {
2473 struct mbq q; /* packets from RX hw queues to host stack */
2474 struct netmap_adapter *na = NULL;
2475 struct netmap_mem_d *nmd = NULL;
2476 struct ifnet *ifp = NULL;
2477 int error = 0;
2478 u_int i, qfirst, qlast;
2479 struct netmap_kring **krings;
2480 int sync_flags;
2481 enum txrx t;
2482
2483 switch (cmd) {
2484 case NIOCCTRL: {
2485 struct nmreq_header *hdr = (struct nmreq_header *)data;
2486
2487 if (hdr->nr_version < NETMAP_MIN_API ||
2488 hdr->nr_version > NETMAP_MAX_API) {
2489 nm_prerr("API mismatch: got %d need %d",
2490 hdr->nr_version, NETMAP_API);
2491 return EINVAL;
2492 }
2493
2494 /* Make a kernel-space copy of the user-space nr_body.
2495 * For convenince, the nr_body pointer and the pointers
2496 * in the options list will be replaced with their
2497 * kernel-space counterparts. The original pointers are
2498 * saved internally and later restored by nmreq_copyout
2499 */
2500 error = nmreq_copyin(hdr, nr_body_is_user);
2501 if (error) {
2502 return error;
2503 }
2504
2505 /* Sanitize hdr->nr_name. */
2506 hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2507
2508 switch (hdr->nr_reqtype) {
2509 case NETMAP_REQ_REGISTER: {
2510 struct nmreq_register *req =
2511 (struct nmreq_register *)(uintptr_t)hdr->nr_body;
2512 struct netmap_if *nifp;
2513
2514 /* Protect access to priv from concurrent requests. */
2515 NMG_LOCK();
2516 do {
2517 struct nmreq_option *opt;
2518 u_int memflags;
2519
2520 if (priv->np_nifp != NULL) { /* thread already registered */
2521 error = EBUSY;
2522 break;
2523 }
2524
2525 #ifdef WITH_EXTMEM
2526 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
2527 if (opt != NULL) {
2528 struct nmreq_opt_extmem *e =
2529 (struct nmreq_opt_extmem *)opt;
2530
2531 nmd = netmap_mem_ext_create(e->nro_usrptr,
2532 &e->nro_info, &error);
2533 opt->nro_status = error;
2534 if (nmd == NULL)
2535 break;
2536 }
2537 #endif /* WITH_EXTMEM */
2538
2539 if (nmd == NULL && req->nr_mem_id) {
2540 /* find the allocator and get a reference */
2541 nmd = netmap_mem_find(req->nr_mem_id);
2542 if (nmd == NULL) {
2543 if (netmap_verbose) {
2544 nm_prerr("%s: failed to find mem_id %u",
2545 hdr->nr_name, req->nr_mem_id);
2546 }
2547 error = EINVAL;
2548 break;
2549 }
2550 }
2551 /* find the interface and a reference */
2552 error = netmap_get_na(hdr, &na, &ifp, nmd,
2553 1 /* create */); /* keep reference */
2554 if (error)
2555 break;
2556 if (NETMAP_OWNED_BY_KERN(na)) {
2557 error = EBUSY;
2558 break;
2559 }
2560
2561 if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2562 nm_prerr("virt_hdr_len=%d, but application does "
2563 "not accept it", na->virt_hdr_len);
2564 error = EIO;
2565 break;
2566 }
2567
2568 error = netmap_do_regif(priv, na, hdr);
2569 if (error) { /* reg. failed, release priv and ref */
2570 break;
2571 }
2572
2573 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2574 if (opt != NULL) {
2575 struct nmreq_opt_csb *csbo =
2576 (struct nmreq_opt_csb *)opt;
2577 error = netmap_csb_validate(priv, csbo);
2578 opt->nro_status = error;
2579 if (error) {
2580 netmap_do_unregif(priv);
2581 break;
2582 }
2583 }
2584
2585 nifp = priv->np_nifp;
2586
2587 /* return the offset of the netmap_if object */
2588 req->nr_rx_rings = na->num_rx_rings;
2589 req->nr_tx_rings = na->num_tx_rings;
2590 req->nr_rx_slots = na->num_rx_desc;
2591 req->nr_tx_slots = na->num_tx_desc;
2592 req->nr_host_tx_rings = na->num_host_tx_rings;
2593 req->nr_host_rx_rings = na->num_host_rx_rings;
2594 error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2595 &req->nr_mem_id);
2596 if (error) {
2597 netmap_do_unregif(priv);
2598 break;
2599 }
2600 if (memflags & NETMAP_MEM_PRIVATE) {
2601 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2602 }
2603 for_rx_tx(t) {
2604 priv->np_si[t] = nm_si_user(priv, t) ?
2605 &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2606 }
2607
2608 if (req->nr_extra_bufs) {
2609 if (netmap_verbose)
2610 nm_prinf("requested %d extra buffers",
2611 req->nr_extra_bufs);
2612 req->nr_extra_bufs = netmap_extra_alloc(na,
2613 &nifp->ni_bufs_head, req->nr_extra_bufs);
2614 if (netmap_verbose)
2615 nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2616 }
2617 req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2618
2619 error = nmreq_checkoptions(hdr);
2620 if (error) {
2621 netmap_do_unregif(priv);
2622 break;
2623 }
2624
2625 /* store ifp reference so that priv destructor may release it */
2626 priv->np_ifp = ifp;
2627 } while (0);
2628 if (error) {
2629 netmap_unget_na(na, ifp);
2630 }
2631 /* release the reference from netmap_mem_find() or
2632 * netmap_mem_ext_create()
2633 */
2634 if (nmd)
2635 netmap_mem_put(nmd);
2636 NMG_UNLOCK();
2637 break;
2638 }
2639
2640 case NETMAP_REQ_PORT_INFO_GET: {
2641 struct nmreq_port_info_get *req =
2642 (struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2643 int nmd_ref = 0;
2644
2645 NMG_LOCK();
2646 do {
2647 u_int memflags;
2648
2649 if (hdr->nr_name[0] != '\0') {
2650 /* Build a nmreq_register out of the nmreq_port_info_get,
2651 * so that we can call netmap_get_na(). */
2652 struct nmreq_register regreq;
2653 bzero(®req, sizeof(regreq));
2654 regreq.nr_mode = NR_REG_ALL_NIC;
2655 regreq.nr_tx_slots = req->nr_tx_slots;
2656 regreq.nr_rx_slots = req->nr_rx_slots;
2657 regreq.nr_tx_rings = req->nr_tx_rings;
2658 regreq.nr_rx_rings = req->nr_rx_rings;
2659 regreq.nr_host_tx_rings = req->nr_host_tx_rings;
2660 regreq.nr_host_rx_rings = req->nr_host_rx_rings;
2661 regreq.nr_mem_id = req->nr_mem_id;
2662
2663 /* get a refcount */
2664 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2665 hdr->nr_body = (uintptr_t)®req;
2666 error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2667 hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2668 hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2669 if (error) {
2670 na = NULL;
2671 ifp = NULL;
2672 break;
2673 }
2674 nmd = na->nm_mem; /* get memory allocator */
2675 } else {
2676 nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2677 if (nmd == NULL) {
2678 if (netmap_verbose)
2679 nm_prerr("%s: failed to find mem_id %u",
2680 hdr->nr_name,
2681 req->nr_mem_id ? req->nr_mem_id : 1);
2682 error = EINVAL;
2683 break;
2684 }
2685 nmd_ref = 1;
2686 }
2687
2688 error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2689 &req->nr_mem_id);
2690 if (error)
2691 break;
2692 if (na == NULL) /* only memory info */
2693 break;
2694 netmap_update_config(na);
2695 req->nr_rx_rings = na->num_rx_rings;
2696 req->nr_tx_rings = na->num_tx_rings;
2697 req->nr_rx_slots = na->num_rx_desc;
2698 req->nr_tx_slots = na->num_tx_desc;
2699 req->nr_host_tx_rings = na->num_host_tx_rings;
2700 req->nr_host_rx_rings = na->num_host_rx_rings;
2701 } while (0);
2702 netmap_unget_na(na, ifp);
2703 if (nmd_ref)
2704 netmap_mem_put(nmd);
2705 NMG_UNLOCK();
2706 break;
2707 }
2708 #ifdef WITH_VALE
2709 case NETMAP_REQ_VALE_ATTACH: {
2710 error = netmap_vale_attach(hdr, NULL /* userspace request */);
2711 break;
2712 }
2713
2714 case NETMAP_REQ_VALE_DETACH: {
2715 error = netmap_vale_detach(hdr, NULL /* userspace request */);
2716 break;
2717 }
2718
2719 case NETMAP_REQ_VALE_LIST: {
2720 error = netmap_vale_list(hdr);
2721 break;
2722 }
2723
2724 case NETMAP_REQ_PORT_HDR_SET: {
2725 struct nmreq_port_hdr *req =
2726 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2727 /* Build a nmreq_register out of the nmreq_port_hdr,
2728 * so that we can call netmap_get_bdg_na(). */
2729 struct nmreq_register regreq;
2730 bzero(®req, sizeof(regreq));
2731 regreq.nr_mode = NR_REG_ALL_NIC;
2732
2733 /* For now we only support virtio-net headers, and only for
2734 * VALE ports, but this may change in future. Valid lengths
2735 * for the virtio-net header are 0 (no header), 10 and 12. */
2736 if (req->nr_hdr_len != 0 &&
2737 req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2738 req->nr_hdr_len != 12) {
2739 if (netmap_verbose)
2740 nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2741 error = EINVAL;
2742 break;
2743 }
2744 NMG_LOCK();
2745 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2746 hdr->nr_body = (uintptr_t)®req;
2747 error = netmap_get_vale_na(hdr, &na, NULL, 0);
2748 hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2749 hdr->nr_body = (uintptr_t)req;
2750 if (na && !error) {
2751 struct netmap_vp_adapter *vpna =
2752 (struct netmap_vp_adapter *)na;
2753 na->virt_hdr_len = req->nr_hdr_len;
2754 if (na->virt_hdr_len) {
2755 vpna->mfs = NETMAP_BUF_SIZE(na);
2756 }
2757 if (netmap_verbose)
2758 nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2759 netmap_adapter_put(na);
2760 } else if (!na) {
2761 error = ENXIO;
2762 }
2763 NMG_UNLOCK();
2764 break;
2765 }
2766
2767 case NETMAP_REQ_PORT_HDR_GET: {
2768 /* Get vnet-header length for this netmap port */
2769 struct nmreq_port_hdr *req =
2770 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2771 /* Build a nmreq_register out of the nmreq_port_hdr,
2772 * so that we can call netmap_get_bdg_na(). */
2773 struct nmreq_register regreq;
2774 struct ifnet *ifp;
2775
2776 bzero(®req, sizeof(regreq));
2777 regreq.nr_mode = NR_REG_ALL_NIC;
2778 NMG_LOCK();
2779 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2780 hdr->nr_body = (uintptr_t)®req;
2781 error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2782 hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2783 hdr->nr_body = (uintptr_t)req;
2784 if (na && !error) {
2785 req->nr_hdr_len = na->virt_hdr_len;
2786 }
2787 netmap_unget_na(na, ifp);
2788 NMG_UNLOCK();
2789 break;
2790 }
2791
2792 case NETMAP_REQ_VALE_NEWIF: {
2793 error = nm_vi_create(hdr);
2794 break;
2795 }
2796
2797 case NETMAP_REQ_VALE_DELIF: {
2798 error = nm_vi_destroy(hdr->nr_name);
2799 break;
2800 }
2801
2802 case NETMAP_REQ_VALE_POLLING_ENABLE:
2803 case NETMAP_REQ_VALE_POLLING_DISABLE: {
2804 error = nm_bdg_polling(hdr);
2805 break;
2806 }
2807 #endif /* WITH_VALE */
2808 case NETMAP_REQ_POOLS_INFO_GET: {
2809 /* Get information from the memory allocator used for
2810 * hdr->nr_name. */
2811 struct nmreq_pools_info *req =
2812 (struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2813 NMG_LOCK();
2814 do {
2815 /* Build a nmreq_register out of the nmreq_pools_info,
2816 * so that we can call netmap_get_na(). */
2817 struct nmreq_register regreq;
2818 bzero(®req, sizeof(regreq));
2819 regreq.nr_mem_id = req->nr_mem_id;
2820 regreq.nr_mode = NR_REG_ALL_NIC;
2821
2822 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2823 hdr->nr_body = (uintptr_t)®req;
2824 error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2825 hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2826 hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2827 if (error) {
2828 na = NULL;
2829 ifp = NULL;
2830 break;
2831 }
2832 nmd = na->nm_mem; /* grab the memory allocator */
2833 if (nmd == NULL) {
2834 error = EINVAL;
2835 break;
2836 }
2837
2838 /* Finalize the memory allocator, get the pools
2839 * information and release the allocator. */
2840 error = netmap_mem_finalize(nmd, na);
2841 if (error) {
2842 break;
2843 }
2844 error = netmap_mem_pools_info_get(req, nmd);
2845 netmap_mem_drop(na);
2846 } while (0);
2847 netmap_unget_na(na, ifp);
2848 NMG_UNLOCK();
2849 break;
2850 }
2851
2852 case NETMAP_REQ_CSB_ENABLE: {
2853 struct nmreq_option *opt;
2854
2855 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2856 if (opt == NULL) {
2857 error = EINVAL;
2858 } else {
2859 struct nmreq_opt_csb *csbo =
2860 (struct nmreq_opt_csb *)opt;
2861 NMG_LOCK();
2862 error = netmap_csb_validate(priv, csbo);
2863 NMG_UNLOCK();
2864 opt->nro_status = error;
2865 }
2866 break;
2867 }
2868
2869 case NETMAP_REQ_SYNC_KLOOP_START: {
2870 error = netmap_sync_kloop(priv, hdr);
2871 break;
2872 }
2873
2874 case NETMAP_REQ_SYNC_KLOOP_STOP: {
2875 error = netmap_sync_kloop_stop(priv);
2876 break;
2877 }
2878
2879 default: {
2880 error = EINVAL;
2881 break;
2882 }
2883 }
2884 /* Write back request body to userspace and reset the
2885 * user-space pointer. */
2886 error = nmreq_copyout(hdr, error);
2887 break;
2888 }
2889
2890 case NIOCTXSYNC:
2891 case NIOCRXSYNC: {
2892 if (unlikely(priv->np_nifp == NULL)) {
2893 error = ENXIO;
2894 break;
2895 }
2896 mb(); /* make sure following reads are not from cache */
2897
2898 if (unlikely(priv->np_csb_atok_base)) {
2899 nm_prerr("Invalid sync in CSB mode");
2900 error = EBUSY;
2901 break;
2902 }
2903
2904 na = priv->np_na; /* we have a reference */
2905
2906 mbq_init(&q);
2907 t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2908 krings = NMR(na, t);
2909 qfirst = priv->np_qfirst[t];
2910 qlast = priv->np_qlast[t];
2911 sync_flags = priv->np_sync_flags;
2912
2913 for (i = qfirst; i < qlast; i++) {
2914 struct netmap_kring *kring = krings[i];
2915 struct netmap_ring *ring = kring->ring;
2916
2917 if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2918 error = (error ? EIO : 0);
2919 continue;
2920 }
2921
2922 if (cmd == NIOCTXSYNC) {
2923 if (netmap_debug & NM_DEBUG_TXSYNC)
2924 nm_prinf("pre txsync ring %d cur %d hwcur %d",
2925 i, ring->cur,
2926 kring->nr_hwcur);
2927 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2928 netmap_ring_reinit(kring);
2929 } else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2930 nm_sync_finalize(kring);
2931 }
2932 if (netmap_debug & NM_DEBUG_TXSYNC)
2933 nm_prinf("post txsync ring %d cur %d hwcur %d",
2934 i, ring->cur,
2935 kring->nr_hwcur);
2936 } else {
2937 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2938 netmap_ring_reinit(kring);
2939 }
2940 if (nm_may_forward_up(kring)) {
2941 /* transparent forwarding, see netmap_poll() */
2942 netmap_grab_packets(kring, &q, netmap_fwd);
2943 }
2944 if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2945 nm_sync_finalize(kring);
2946 }
2947 ring_timestamp_set(ring);
2948 }
2949 nm_kr_put(kring);
2950 }
2951
2952 if (mbq_peek(&q)) {
2953 netmap_send_up(na->ifp, &q);
2954 }
2955
2956 break;
2957 }
2958
2959 default: {
2960 return netmap_ioctl_legacy(priv, cmd, data, td);
2961 break;
2962 }
2963 }
2964
2965 return (error);
2966 }
2967
2968 size_t
nmreq_size_by_type(uint16_t nr_reqtype)2969 nmreq_size_by_type(uint16_t nr_reqtype)
2970 {
2971 switch (nr_reqtype) {
2972 case NETMAP_REQ_REGISTER:
2973 return sizeof(struct nmreq_register);
2974 case NETMAP_REQ_PORT_INFO_GET:
2975 return sizeof(struct nmreq_port_info_get);
2976 case NETMAP_REQ_VALE_ATTACH:
2977 return sizeof(struct nmreq_vale_attach);
2978 case NETMAP_REQ_VALE_DETACH:
2979 return sizeof(struct nmreq_vale_detach);
2980 case NETMAP_REQ_VALE_LIST:
2981 return sizeof(struct nmreq_vale_list);
2982 case NETMAP_REQ_PORT_HDR_SET:
2983 case NETMAP_REQ_PORT_HDR_GET:
2984 return sizeof(struct nmreq_port_hdr);
2985 case NETMAP_REQ_VALE_NEWIF:
2986 return sizeof(struct nmreq_vale_newif);
2987 case NETMAP_REQ_VALE_DELIF:
2988 case NETMAP_REQ_SYNC_KLOOP_STOP:
2989 case NETMAP_REQ_CSB_ENABLE:
2990 return 0;
2991 case NETMAP_REQ_VALE_POLLING_ENABLE:
2992 case NETMAP_REQ_VALE_POLLING_DISABLE:
2993 return sizeof(struct nmreq_vale_polling);
2994 case NETMAP_REQ_POOLS_INFO_GET:
2995 return sizeof(struct nmreq_pools_info);
2996 case NETMAP_REQ_SYNC_KLOOP_START:
2997 return sizeof(struct nmreq_sync_kloop_start);
2998 }
2999 return 0;
3000 }
3001
3002 static size_t
nmreq_opt_size_by_type(uint32_t nro_reqtype,uint64_t nro_size)3003 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
3004 {
3005 size_t rv = sizeof(struct nmreq_option);
3006 #ifdef NETMAP_REQ_OPT_DEBUG
3007 if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
3008 return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
3009 #endif /* NETMAP_REQ_OPT_DEBUG */
3010 switch (nro_reqtype) {
3011 #ifdef WITH_EXTMEM
3012 case NETMAP_REQ_OPT_EXTMEM:
3013 rv = sizeof(struct nmreq_opt_extmem);
3014 break;
3015 #endif /* WITH_EXTMEM */
3016 case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
3017 if (nro_size >= rv)
3018 rv = nro_size;
3019 break;
3020 case NETMAP_REQ_OPT_CSB:
3021 rv = sizeof(struct nmreq_opt_csb);
3022 break;
3023 case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
3024 rv = sizeof(struct nmreq_opt_sync_kloop_mode);
3025 break;
3026 }
3027 /* subtract the common header */
3028 return rv - sizeof(struct nmreq_option);
3029 }
3030
3031 /*
3032 * nmreq_copyin: create an in-kernel version of the request.
3033 *
3034 * We build the following data structure:
3035 *
3036 * hdr -> +-------+ buf
3037 * | | +---------------+
3038 * +-------+ |usr body ptr |
3039 * |options|-. +---------------+
3040 * +-------+ | |usr options ptr|
3041 * |body |--------->+---------------+
3042 * +-------+ | | |
3043 * | | copy of body |
3044 * | | |
3045 * | +---------------+
3046 * | | NULL |
3047 * | +---------------+
3048 * | .---| |\
3049 * | | +---------------+ |
3050 * | .------| | |
3051 * | | | +---------------+ \ option table
3052 * | | | | ... | / indexed by option
3053 * | | | +---------------+ | type
3054 * | | | | | |
3055 * | | | +---------------+/
3056 * | | | |usr next ptr 1 |
3057 * `-|----->+---------------+
3058 * | | | copy of opt 1 |
3059 * | | | |
3060 * | | .-| nro_next |
3061 * | | | +---------------+
3062 * | | | |usr next ptr 2 |
3063 * | `-`>+---------------+
3064 * | | copy of opt 2 |
3065 * | | |
3066 * | .-| nro_next |
3067 * | | +---------------+
3068 * | | | |
3069 * ~ ~ ~ ... ~
3070 * | .-| |
3071 * `----->+---------------+
3072 * | |usr next ptr n |
3073 * `>+---------------+
3074 * | copy of opt n |
3075 * | |
3076 * | nro_next(NULL)|
3077 * +---------------+
3078 *
3079 * The options and body fields of the hdr structure are overwritten
3080 * with in-kernel valid pointers inside the buf. The original user
3081 * pointers are saved in the buf and restored on copyout.
3082 * The list of options is copied and the pointers adjusted. The
3083 * original pointers are saved before the option they belonged.
3084 *
3085 * The option table has an entry for every available option. Entries
3086 * for options that have not been passed contain NULL.
3087 *
3088 */
3089
3090 int
nmreq_copyin(struct nmreq_header * hdr,int nr_body_is_user)3091 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
3092 {
3093 size_t rqsz, optsz, bufsz;
3094 int error = 0;
3095 char *ker = NULL, *p;
3096 struct nmreq_option **next, *src, **opt_tab;
3097 uint64_t *ptrs;
3098
3099 if (hdr->nr_reserved) {
3100 if (netmap_verbose)
3101 nm_prerr("nr_reserved must be zero");
3102 return EINVAL;
3103 }
3104
3105 if (!nr_body_is_user)
3106 return 0;
3107
3108 hdr->nr_reserved = nr_body_is_user;
3109
3110 /* compute the total size of the buffer */
3111 rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3112 if (rqsz > NETMAP_REQ_MAXSIZE) {
3113 error = EMSGSIZE;
3114 goto out_err;
3115 }
3116 if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3117 (!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3118 /* Request body expected, but not found; or
3119 * request body found but unexpected. */
3120 if (netmap_verbose)
3121 nm_prerr("nr_body expected but not found, or vice versa");
3122 error = EINVAL;
3123 goto out_err;
3124 }
3125
3126 /*
3127 * The buffer size must be large enough to store the request body,
3128 * all the possible options and the additional user pointers
3129 * (2+NETMAP_REQ_OPT_MAX). Note that the maximum size of body plus
3130 * options can not exceed NETMAP_REQ_MAXSIZE;
3131 */
3132 bufsz = (2 + NETMAP_REQ_OPT_MAX) * sizeof(void *) + NETMAP_REQ_MAXSIZE +
3133 NETMAP_REQ_OPT_MAX * sizeof(opt_tab);
3134
3135 ker = nm_os_malloc(bufsz);
3136 if (ker == NULL) {
3137 error = ENOMEM;
3138 goto out_err;
3139 }
3140 p = ker; /* write pointer into the buffer */
3141
3142 /* make a copy of the user pointers */
3143 ptrs = (uint64_t*)p;
3144 *ptrs++ = hdr->nr_body;
3145 *ptrs++ = hdr->nr_options;
3146 p = (char *)ptrs;
3147
3148 /* copy the body */
3149 error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3150 if (error)
3151 goto out_restore;
3152 /* overwrite the user pointer with the in-kernel one */
3153 hdr->nr_body = (uintptr_t)p;
3154 p += rqsz;
3155 /* start of the options table */
3156 opt_tab = (struct nmreq_option **)p;
3157 p += sizeof(opt_tab) * NETMAP_REQ_OPT_MAX;
3158
3159 /* copy the options */
3160 next = (struct nmreq_option **)&hdr->nr_options;
3161 src = *next;
3162 while (src) {
3163 struct nmreq_option *opt;
3164
3165 /* copy the option header */
3166 ptrs = (uint64_t *)p;
3167 opt = (struct nmreq_option *)(ptrs + 1);
3168 error = copyin(src, opt, sizeof(*src));
3169 if (error)
3170 goto out_restore;
3171 rqsz += sizeof(*src);
3172 /* make a copy of the user next pointer */
3173 *ptrs = opt->nro_next;
3174 /* overwrite the user pointer with the in-kernel one */
3175 *next = opt;
3176
3177 /* initialize the option as not supported.
3178 * Recognized options will update this field.
3179 */
3180 opt->nro_status = EOPNOTSUPP;
3181
3182 /* check for invalid types */
3183 if (opt->nro_reqtype < 1) {
3184 if (netmap_verbose)
3185 nm_prinf("invalid option type: %u", opt->nro_reqtype);
3186 opt->nro_status = EINVAL;
3187 error = EINVAL;
3188 goto next;
3189 }
3190
3191 if (opt->nro_reqtype >= NETMAP_REQ_OPT_MAX) {
3192 /* opt->nro_status is already EOPNOTSUPP */
3193 error = EOPNOTSUPP;
3194 goto next;
3195 }
3196
3197 /* if the type is valid, index the option in the table
3198 * unless it is a duplicate.
3199 */
3200 if (opt_tab[opt->nro_reqtype] != NULL) {
3201 if (netmap_verbose)
3202 nm_prinf("duplicate option: %u", opt->nro_reqtype);
3203 opt->nro_status = EINVAL;
3204 opt_tab[opt->nro_reqtype]->nro_status = EINVAL;
3205 error = EINVAL;
3206 goto next;
3207 }
3208 opt_tab[opt->nro_reqtype] = opt;
3209
3210 p = (char *)(opt + 1);
3211
3212 /* copy the option body */
3213 optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3214 opt->nro_size);
3215 /* check optsz and nro_size to avoid for possible integer overflows of rqsz */
3216 if ((optsz > NETMAP_REQ_MAXSIZE) || (opt->nro_size > NETMAP_REQ_MAXSIZE)
3217 || (rqsz + optsz > NETMAP_REQ_MAXSIZE)
3218 || (optsz > 0 && rqsz + optsz <= rqsz)) {
3219 error = EMSGSIZE;
3220 goto out_restore;
3221 }
3222 rqsz += optsz;
3223 if (optsz) {
3224 /* the option body follows the option header */
3225 error = copyin(src + 1, p, optsz);
3226 if (error)
3227 goto out_restore;
3228 p += optsz;
3229 }
3230
3231 next:
3232 /* move to next option */
3233 next = (struct nmreq_option **)&opt->nro_next;
3234 src = *next;
3235 }
3236 if (error)
3237 nmreq_copyout(hdr, error);
3238 return error;
3239
3240 out_restore:
3241 ptrs = (uint64_t *)ker;
3242 hdr->nr_body = *ptrs++;
3243 hdr->nr_options = *ptrs++;
3244 hdr->nr_reserved = 0;
3245 nm_os_free(ker);
3246 out_err:
3247 return error;
3248 }
3249
3250 static int
nmreq_copyout(struct nmreq_header * hdr,int rerror)3251 nmreq_copyout(struct nmreq_header *hdr, int rerror)
3252 {
3253 struct nmreq_option *src, *dst;
3254 void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3255 uint64_t *ptrs;
3256 size_t bodysz;
3257 int error;
3258
3259 if (!hdr->nr_reserved)
3260 return rerror;
3261
3262 /* restore the user pointers in the header */
3263 ptrs = (uint64_t *)ker - 2;
3264 bufstart = ptrs;
3265 hdr->nr_body = *ptrs++;
3266 src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3267 hdr->nr_options = *ptrs;
3268
3269 if (!rerror) {
3270 /* copy the body */
3271 bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3272 error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3273 if (error) {
3274 rerror = error;
3275 goto out;
3276 }
3277 }
3278
3279 /* copy the options */
3280 dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3281 while (src) {
3282 size_t optsz;
3283 uint64_t next;
3284
3285 /* restore the user pointer */
3286 next = src->nro_next;
3287 ptrs = (uint64_t *)src - 1;
3288 src->nro_next = *ptrs;
3289
3290 /* always copy the option header */
3291 error = copyout(src, dst, sizeof(*src));
3292 if (error) {
3293 rerror = error;
3294 goto out;
3295 }
3296
3297 /* copy the option body only if there was no error */
3298 if (!rerror && !src->nro_status) {
3299 optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3300 src->nro_size);
3301 if (optsz) {
3302 error = copyout(src + 1, dst + 1, optsz);
3303 if (error) {
3304 rerror = error;
3305 goto out;
3306 }
3307 }
3308 }
3309 src = (struct nmreq_option *)(uintptr_t)next;
3310 dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3311 }
3312
3313
3314 out:
3315 hdr->nr_reserved = 0;
3316 nm_os_free(bufstart);
3317 return rerror;
3318 }
3319
3320 struct nmreq_option *
nmreq_getoption(struct nmreq_header * hdr,uint16_t reqtype)3321 nmreq_getoption(struct nmreq_header *hdr, uint16_t reqtype)
3322 {
3323 struct nmreq_option **opt_tab;
3324
3325 if (!hdr->nr_options)
3326 return NULL;
3327
3328 opt_tab = (struct nmreq_option **)((uintptr_t)hdr->nr_options) -
3329 (NETMAP_REQ_OPT_MAX + 1);
3330 return opt_tab[reqtype];
3331 }
3332
3333 static int
nmreq_checkoptions(struct nmreq_header * hdr)3334 nmreq_checkoptions(struct nmreq_header *hdr)
3335 {
3336 struct nmreq_option *opt;
3337 /* return error if there is still any option
3338 * marked as not supported
3339 */
3340
3341 for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3342 opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3343 if (opt->nro_status == EOPNOTSUPP)
3344 return EOPNOTSUPP;
3345
3346 return 0;
3347 }
3348
3349 /*
3350 * select(2) and poll(2) handlers for the "netmap" device.
3351 *
3352 * Can be called for one or more queues.
3353 * Return true the event mask corresponding to ready events.
3354 * If there are no ready events (and 'sr' is not NULL), do a
3355 * selrecord on either individual selinfo or on the global one.
3356 * Device-dependent parts (locking and sync of tx/rx rings)
3357 * are done through callbacks.
3358 *
3359 * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
3360 * The first one is remapped to pwait as selrecord() uses the name as an
3361 * hidden argument.
3362 */
3363 int
netmap_poll(struct netmap_priv_d * priv,int events,NM_SELRECORD_T * sr)3364 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3365 {
3366 struct netmap_adapter *na;
3367 struct netmap_kring *kring;
3368 struct netmap_ring *ring;
3369 u_int i, want[NR_TXRX], revents = 0;
3370 NM_SELINFO_T *si[NR_TXRX];
3371 #define want_tx want[NR_TX]
3372 #define want_rx want[NR_RX]
3373 struct mbq q; /* packets from RX hw queues to host stack */
3374
3375 /*
3376 * In order to avoid nested locks, we need to "double check"
3377 * txsync and rxsync if we decide to do a selrecord().
3378 * retry_tx (and retry_rx, later) prevent looping forever.
3379 */
3380 int retry_tx = 1, retry_rx = 1;
3381
3382 /* Transparent mode: send_down is 1 if we have found some
3383 * packets to forward (host RX ring --> NIC) during the rx
3384 * scan and we have not sent them down to the NIC yet.
3385 * Transparent mode requires to bind all rings to a single
3386 * file descriptor.
3387 */
3388 int send_down = 0;
3389 int sync_flags = priv->np_sync_flags;
3390
3391 mbq_init(&q);
3392
3393 if (unlikely(priv->np_nifp == NULL)) {
3394 return POLLERR;
3395 }
3396 mb(); /* make sure following reads are not from cache */
3397
3398 na = priv->np_na;
3399
3400 if (unlikely(!nm_netmap_on(na)))
3401 return POLLERR;
3402
3403 if (unlikely(priv->np_csb_atok_base)) {
3404 nm_prerr("Invalid poll in CSB mode");
3405 return POLLERR;
3406 }
3407
3408 if (netmap_debug & NM_DEBUG_ON)
3409 nm_prinf("device %s events 0x%x", na->name, events);
3410 want_tx = events & (POLLOUT | POLLWRNORM);
3411 want_rx = events & (POLLIN | POLLRDNORM);
3412
3413 /*
3414 * If the card has more than one queue AND the file descriptor is
3415 * bound to all of them, we sleep on the "global" selinfo, otherwise
3416 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3417 * per file descriptor).
3418 * The interrupt routine in the driver wake one or the other
3419 * (or both) depending on which clients are active.
3420 *
3421 * rxsync() is only called if we run out of buffers on a POLLIN.
3422 * txsync() is called if we run out of buffers on POLLOUT, or
3423 * there are pending packets to send. The latter can be disabled
3424 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3425 */
3426 si[NR_RX] = priv->np_si[NR_RX];
3427 si[NR_TX] = priv->np_si[NR_TX];
3428
3429 #ifdef __FreeBSD__
3430 /*
3431 * We start with a lock free round which is cheap if we have
3432 * slots available. If this fails, then lock and call the sync
3433 * routines. We can't do this on Linux, as the contract says
3434 * that we must call nm_os_selrecord() unconditionally.
3435 */
3436 if (want_tx) {
3437 const enum txrx t = NR_TX;
3438 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3439 kring = NMR(na, t)[i];
3440 if (kring->ring->cur != kring->ring->tail) {
3441 /* Some unseen TX space is available, so what
3442 * we don't need to run txsync. */
3443 revents |= want[t];
3444 want[t] = 0;
3445 break;
3446 }
3447 }
3448 }
3449 if (want_rx) {
3450 const enum txrx t = NR_RX;
3451 int rxsync_needed = 0;
3452
3453 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3454 kring = NMR(na, t)[i];
3455 if (kring->ring->cur == kring->ring->tail
3456 || kring->rhead != kring->ring->head) {
3457 /* There are no unseen packets on this ring,
3458 * or there are some buffers to be returned
3459 * to the netmap port. We therefore go ahead
3460 * and run rxsync. */
3461 rxsync_needed = 1;
3462 break;
3463 }
3464 }
3465 if (!rxsync_needed) {
3466 revents |= want_rx;
3467 want_rx = 0;
3468 }
3469 }
3470 #endif
3471
3472 #ifdef linux
3473 /* The selrecord must be unconditional on linux. */
3474 nm_os_selrecord(sr, si[NR_RX]);
3475 nm_os_selrecord(sr, si[NR_TX]);
3476 #endif /* linux */
3477
3478 /*
3479 * If we want to push packets out (priv->np_txpoll) or
3480 * want_tx is still set, we must issue txsync calls
3481 * (on all rings, to avoid that the tx rings stall).
3482 * Fortunately, normal tx mode has np_txpoll set.
3483 */
3484 if (priv->np_txpoll || want_tx) {
3485 /*
3486 * The first round checks if anyone is ready, if not
3487 * do a selrecord and another round to handle races.
3488 * want_tx goes to 0 if any space is found, and is
3489 * used to skip rings with no pending transmissions.
3490 */
3491 flush_tx:
3492 for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3493 int found = 0;
3494
3495 kring = na->tx_rings[i];
3496 ring = kring->ring;
3497
3498 /*
3499 * Don't try to txsync this TX ring if we already found some
3500 * space in some of the TX rings (want_tx == 0) and there are no
3501 * TX slots in this ring that need to be flushed to the NIC
3502 * (head == hwcur).
3503 */
3504 if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3505 continue;
3506
3507 if (nm_kr_tryget(kring, 1, &revents))
3508 continue;
3509
3510 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3511 netmap_ring_reinit(kring);
3512 revents |= POLLERR;
3513 } else {
3514 if (kring->nm_sync(kring, sync_flags))
3515 revents |= POLLERR;
3516 else
3517 nm_sync_finalize(kring);
3518 }
3519
3520 /*
3521 * If we found new slots, notify potential
3522 * listeners on the same ring.
3523 * Since we just did a txsync, look at the copies
3524 * of cur,tail in the kring.
3525 */
3526 found = kring->rcur != kring->rtail;
3527 nm_kr_put(kring);
3528 if (found) { /* notify other listeners */
3529 revents |= want_tx;
3530 want_tx = 0;
3531 #ifndef linux
3532 kring->nm_notify(kring, 0);
3533 #endif /* linux */
3534 }
3535 }
3536 /* if there were any packet to forward we must have handled them by now */
3537 send_down = 0;
3538 if (want_tx && retry_tx && sr) {
3539 #ifndef linux
3540 nm_os_selrecord(sr, si[NR_TX]);
3541 #endif /* !linux */
3542 retry_tx = 0;
3543 goto flush_tx;
3544 }
3545 }
3546
3547 /*
3548 * If want_rx is still set scan receive rings.
3549 * Do it on all rings because otherwise we starve.
3550 */
3551 if (want_rx) {
3552 /* two rounds here for race avoidance */
3553 do_retry_rx:
3554 for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3555 int found = 0;
3556
3557 kring = na->rx_rings[i];
3558 ring = kring->ring;
3559
3560 if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3561 continue;
3562
3563 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3564 netmap_ring_reinit(kring);
3565 revents |= POLLERR;
3566 }
3567 /* now we can use kring->rcur, rtail */
3568
3569 /*
3570 * transparent mode support: collect packets from
3571 * hw rxring(s) that have been released by the user
3572 */
3573 if (nm_may_forward_up(kring)) {
3574 netmap_grab_packets(kring, &q, netmap_fwd);
3575 }
3576
3577 /* Clear the NR_FORWARD flag anyway, it may be set by
3578 * the nm_sync() below only on for the host RX ring (see
3579 * netmap_rxsync_from_host()). */
3580 kring->nr_kflags &= ~NR_FORWARD;
3581 if (kring->nm_sync(kring, sync_flags))
3582 revents |= POLLERR;
3583 else
3584 nm_sync_finalize(kring);
3585 send_down |= (kring->nr_kflags & NR_FORWARD);
3586 ring_timestamp_set(ring);
3587 found = kring->rcur != kring->rtail;
3588 nm_kr_put(kring);
3589 if (found) {
3590 revents |= want_rx;
3591 retry_rx = 0;
3592 #ifndef linux
3593 kring->nm_notify(kring, 0);
3594 #endif /* linux */
3595 }
3596 }
3597
3598 #ifndef linux
3599 if (retry_rx && sr) {
3600 nm_os_selrecord(sr, si[NR_RX]);
3601 }
3602 #endif /* !linux */
3603 if (send_down || retry_rx) {
3604 retry_rx = 0;
3605 if (send_down)
3606 goto flush_tx; /* and retry_rx */
3607 else
3608 goto do_retry_rx;
3609 }
3610 }
3611
3612 /*
3613 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3614 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3615 * to the host stack.
3616 */
3617
3618 if (mbq_peek(&q)) {
3619 netmap_send_up(na->ifp, &q);
3620 }
3621
3622 return (revents);
3623 #undef want_tx
3624 #undef want_rx
3625 }
3626
3627 int
nma_intr_enable(struct netmap_adapter * na,int onoff)3628 nma_intr_enable(struct netmap_adapter *na, int onoff)
3629 {
3630 bool changed = false;
3631 enum txrx t;
3632 int i;
3633
3634 for_rx_tx(t) {
3635 for (i = 0; i < nma_get_nrings(na, t); i++) {
3636 struct netmap_kring *kring = NMR(na, t)[i];
3637 int on = !(kring->nr_kflags & NKR_NOINTR);
3638
3639 if (!!onoff != !!on) {
3640 changed = true;
3641 }
3642 if (onoff) {
3643 kring->nr_kflags &= ~NKR_NOINTR;
3644 } else {
3645 kring->nr_kflags |= NKR_NOINTR;
3646 }
3647 }
3648 }
3649
3650 if (!changed) {
3651 return 0; /* nothing to do */
3652 }
3653
3654 if (!na->nm_intr) {
3655 nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3656 na->name);
3657 return -1;
3658 }
3659
3660 na->nm_intr(na, onoff);
3661
3662 return 0;
3663 }
3664
3665
3666 /*-------------------- driver support routines -------------------*/
3667
3668 /* default notify callback */
3669 static int
netmap_notify(struct netmap_kring * kring,int flags)3670 netmap_notify(struct netmap_kring *kring, int flags)
3671 {
3672 struct netmap_adapter *na = kring->notify_na;
3673 enum txrx t = kring->tx;
3674
3675 nm_os_selwakeup(&kring->si);
3676 /* optimization: avoid a wake up on the global
3677 * queue if nobody has registered for more
3678 * than one ring
3679 */
3680 if (na->si_users[t] > 0)
3681 nm_os_selwakeup(&na->si[t]);
3682
3683 return NM_IRQ_COMPLETED;
3684 }
3685
3686 /* called by all routines that create netmap_adapters.
3687 * provide some defaults and get a reference to the
3688 * memory allocator
3689 */
3690 int
netmap_attach_common(struct netmap_adapter * na)3691 netmap_attach_common(struct netmap_adapter *na)
3692 {
3693 if (!na->rx_buf_maxsize) {
3694 /* Set a conservative default (larger is safer). */
3695 na->rx_buf_maxsize = PAGE_SIZE;
3696 }
3697
3698 #ifdef __FreeBSD__
3699 if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3700 na->if_input = na->ifp->if_input; /* for netmap_send_up */
3701 }
3702 na->pdev = na; /* make sure netmap_mem_map() is called */
3703 #endif /* __FreeBSD__ */
3704 if (na->na_flags & NAF_HOST_RINGS) {
3705 if (na->num_host_rx_rings == 0)
3706 na->num_host_rx_rings = 1;
3707 if (na->num_host_tx_rings == 0)
3708 na->num_host_tx_rings = 1;
3709 }
3710 if (na->nm_krings_create == NULL) {
3711 /* we assume that we have been called by a driver,
3712 * since other port types all provide their own
3713 * nm_krings_create
3714 */
3715 na->nm_krings_create = netmap_hw_krings_create;
3716 na->nm_krings_delete = netmap_hw_krings_delete;
3717 }
3718 if (na->nm_notify == NULL)
3719 na->nm_notify = netmap_notify;
3720 na->active_fds = 0;
3721
3722 if (na->nm_mem == NULL) {
3723 /* use the global allocator */
3724 na->nm_mem = netmap_mem_get(&nm_mem);
3725 }
3726 #ifdef WITH_VALE
3727 if (na->nm_bdg_attach == NULL)
3728 /* no special nm_bdg_attach callback. On VALE
3729 * attach, we need to interpose a bwrap
3730 */
3731 na->nm_bdg_attach = netmap_default_bdg_attach;
3732 #endif
3733
3734 return 0;
3735 }
3736
3737 /* Wrapper for the register callback provided netmap-enabled
3738 * hardware drivers.
3739 * nm_iszombie(na) means that the driver module has been
3740 * unloaded, so we cannot call into it.
3741 * nm_os_ifnet_lock() must guarantee mutual exclusion with
3742 * module unloading.
3743 */
3744 static int
netmap_hw_reg(struct netmap_adapter * na,int onoff)3745 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3746 {
3747 struct netmap_hw_adapter *hwna =
3748 (struct netmap_hw_adapter*)na;
3749 int error = 0;
3750
3751 nm_os_ifnet_lock();
3752
3753 if (nm_iszombie(na)) {
3754 if (onoff) {
3755 error = ENXIO;
3756 } else if (na != NULL) {
3757 na->na_flags &= ~NAF_NETMAP_ON;
3758 }
3759 goto out;
3760 }
3761
3762 error = hwna->nm_hw_register(na, onoff);
3763
3764 out:
3765 nm_os_ifnet_unlock();
3766
3767 return error;
3768 }
3769
3770 static void
netmap_hw_dtor(struct netmap_adapter * na)3771 netmap_hw_dtor(struct netmap_adapter *na)
3772 {
3773 if (na->ifp == NULL)
3774 return;
3775
3776 NM_DETACH_NA(na->ifp);
3777 }
3778
3779
3780 /*
3781 * Allocate a netmap_adapter object, and initialize it from the
3782 * 'arg' passed by the driver on attach.
3783 * We allocate a block of memory of 'size' bytes, which has room
3784 * for struct netmap_adapter plus additional room private to
3785 * the caller.
3786 * Return 0 on success, ENOMEM otherwise.
3787 */
3788 int
netmap_attach_ext(struct netmap_adapter * arg,size_t size,int override_reg)3789 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3790 {
3791 struct netmap_hw_adapter *hwna = NULL;
3792 struct ifnet *ifp = NULL;
3793
3794 if (size < sizeof(struct netmap_hw_adapter)) {
3795 if (netmap_debug & NM_DEBUG_ON)
3796 nm_prerr("Invalid netmap adapter size %d", (int)size);
3797 return EINVAL;
3798 }
3799
3800 if (arg == NULL || arg->ifp == NULL) {
3801 if (netmap_debug & NM_DEBUG_ON)
3802 nm_prerr("either arg or arg->ifp is NULL");
3803 return EINVAL;
3804 }
3805
3806 if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3807 if (netmap_debug & NM_DEBUG_ON)
3808 nm_prerr("%s: invalid rings tx %d rx %d",
3809 arg->name, arg->num_tx_rings, arg->num_rx_rings);
3810 return EINVAL;
3811 }
3812
3813 ifp = arg->ifp;
3814 if (NM_NA_CLASH(ifp)) {
3815 /* If NA(ifp) is not null but there is no valid netmap
3816 * adapter it means that someone else is using the same
3817 * pointer (e.g. ax25_ptr on linux). This happens for
3818 * instance when also PF_RING is in use. */
3819 nm_prerr("Error: netmap adapter hook is busy");
3820 return EBUSY;
3821 }
3822
3823 hwna = nm_os_malloc(size);
3824 if (hwna == NULL)
3825 goto fail;
3826 hwna->up = *arg;
3827 hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3828 strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3829 if (override_reg) {
3830 hwna->nm_hw_register = hwna->up.nm_register;
3831 hwna->up.nm_register = netmap_hw_reg;
3832 }
3833 if (netmap_attach_common(&hwna->up)) {
3834 nm_os_free(hwna);
3835 goto fail;
3836 }
3837 netmap_adapter_get(&hwna->up);
3838
3839 NM_ATTACH_NA(ifp, &hwna->up);
3840
3841 nm_os_onattach(ifp);
3842
3843 if (arg->nm_dtor == NULL) {
3844 hwna->up.nm_dtor = netmap_hw_dtor;
3845 }
3846
3847 if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3848 hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3849 hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3850 return 0;
3851
3852 fail:
3853 nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3854 return (hwna ? EINVAL : ENOMEM);
3855 }
3856
3857
3858 int
netmap_attach(struct netmap_adapter * arg)3859 netmap_attach(struct netmap_adapter *arg)
3860 {
3861 return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3862 1 /* override nm_reg */);
3863 }
3864
3865
3866 void
NM_DBG(netmap_adapter_get)3867 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3868 {
3869 if (!na) {
3870 return;
3871 }
3872
3873 refcount_acquire(&na->na_refcount);
3874 }
3875
3876
3877 /* returns 1 iff the netmap_adapter is destroyed */
3878 int
NM_DBG(netmap_adapter_put)3879 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3880 {
3881 if (!na)
3882 return 1;
3883
3884 if (!refcount_release(&na->na_refcount))
3885 return 0;
3886
3887 if (na->nm_dtor)
3888 na->nm_dtor(na);
3889
3890 if (na->tx_rings) { /* XXX should not happen */
3891 if (netmap_debug & NM_DEBUG_ON)
3892 nm_prerr("freeing leftover tx_rings");
3893 na->nm_krings_delete(na);
3894 }
3895 netmap_pipe_dealloc(na);
3896 if (na->nm_mem)
3897 netmap_mem_put(na->nm_mem);
3898 bzero(na, sizeof(*na));
3899 nm_os_free(na);
3900
3901 return 1;
3902 }
3903
3904 /* nm_krings_create callback for all hardware native adapters */
3905 int
netmap_hw_krings_create(struct netmap_adapter * na)3906 netmap_hw_krings_create(struct netmap_adapter *na)
3907 {
3908 int ret = netmap_krings_create(na, 0);
3909 if (ret == 0) {
3910 /* initialize the mbq for the sw rx ring */
3911 u_int lim = netmap_real_rings(na, NR_RX), i;
3912 for (i = na->num_rx_rings; i < lim; i++) {
3913 mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3914 }
3915 nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
3916 }
3917 return ret;
3918 }
3919
3920
3921
3922 /*
3923 * Called on module unload by the netmap-enabled drivers
3924 */
3925 void
netmap_detach(struct ifnet * ifp)3926 netmap_detach(struct ifnet *ifp)
3927 {
3928 struct netmap_adapter *na = NA(ifp);
3929
3930 if (!na)
3931 return;
3932
3933 NMG_LOCK();
3934 netmap_set_all_rings(na, NM_KR_LOCKED);
3935 /*
3936 * if the netmap adapter is not native, somebody
3937 * changed it, so we can not release it here.
3938 * The NAF_ZOMBIE flag will notify the new owner that
3939 * the driver is gone.
3940 */
3941 if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3942 na->na_flags |= NAF_ZOMBIE;
3943 }
3944 /* give active users a chance to notice that NAF_ZOMBIE has been
3945 * turned on, so that they can stop and return an error to userspace.
3946 * Note that this becomes a NOP if there are no active users and,
3947 * therefore, the put() above has deleted the na, since now NA(ifp) is
3948 * NULL.
3949 */
3950 netmap_enable_all_rings(ifp);
3951 NMG_UNLOCK();
3952 }
3953
3954
3955 /*
3956 * Intercept packets from the network stack and pass them
3957 * to netmap as incoming packets on the 'software' ring.
3958 *
3959 * We only store packets in a bounded mbq and then copy them
3960 * in the relevant rxsync routine.
3961 *
3962 * We rely on the OS to make sure that the ifp and na do not go
3963 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3964 * In nm_register() or whenever there is a reinitialization,
3965 * we make sure to make the mode change visible here.
3966 */
3967 int
netmap_transmit(struct ifnet * ifp,struct mbuf * m)3968 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3969 {
3970 struct netmap_adapter *na = NA(ifp);
3971 struct netmap_kring *kring, *tx_kring;
3972 u_int len = MBUF_LEN(m);
3973 u_int error = ENOBUFS;
3974 unsigned int txr;
3975 struct mbq *q;
3976 int busy;
3977 u_int i;
3978
3979 i = MBUF_TXQ(m);
3980 if (i >= na->num_host_rx_rings) {
3981 i = i % na->num_host_rx_rings;
3982 }
3983 kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3984
3985 // XXX [Linux] we do not need this lock
3986 // if we follow the down/configure/up protocol -gl
3987 // mtx_lock(&na->core_lock);
3988
3989 if (!nm_netmap_on(na)) {
3990 nm_prerr("%s not in netmap mode anymore", na->name);
3991 error = ENXIO;
3992 goto done;
3993 }
3994
3995 txr = MBUF_TXQ(m);
3996 if (txr >= na->num_tx_rings) {
3997 txr %= na->num_tx_rings;
3998 }
3999 tx_kring = NMR(na, NR_TX)[txr];
4000
4001 if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
4002 return MBUF_TRANSMIT(na, ifp, m);
4003 }
4004
4005 q = &kring->rx_queue;
4006
4007 // XXX reconsider long packets if we handle fragments
4008 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
4009 nm_prerr("%s from_host, drop packet size %d > %d", na->name,
4010 len, NETMAP_BUF_SIZE(na));
4011 goto done;
4012 }
4013
4014 if (!netmap_generic_hwcsum) {
4015 if (nm_os_mbuf_has_csum_offld(m)) {
4016 nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
4017 goto done;
4018 }
4019 }
4020
4021 if (nm_os_mbuf_has_seg_offld(m)) {
4022 nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
4023 goto done;
4024 }
4025
4026 #ifdef __FreeBSD__
4027 ETHER_BPF_MTAP(ifp, m);
4028 #endif /* __FreeBSD__ */
4029
4030 /* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
4031 * and maybe other instances of netmap_transmit (the latter
4032 * not possible on Linux).
4033 * We enqueue the mbuf only if we are sure there is going to be
4034 * enough room in the host RX ring, otherwise we drop it.
4035 */
4036 mbq_lock(q);
4037
4038 busy = kring->nr_hwtail - kring->nr_hwcur;
4039 if (busy < 0)
4040 busy += kring->nkr_num_slots;
4041 if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
4042 nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
4043 kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
4044 } else {
4045 mbq_enqueue(q, m);
4046 nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
4047 /* notify outside the lock */
4048 m = NULL;
4049 error = 0;
4050 }
4051 mbq_unlock(q);
4052
4053 done:
4054 if (m)
4055 m_freem(m);
4056 /* unconditionally wake up listeners */
4057 kring->nm_notify(kring, 0);
4058 /* this is normally netmap_notify(), but for nics
4059 * connected to a bridge it is netmap_bwrap_intr_notify(),
4060 * that possibly forwards the frames through the switch
4061 */
4062
4063 return (error);
4064 }
4065
4066
4067 /*
4068 * Reset function to be called by the driver routines when reinitializing
4069 * a hardware ring. The driver is in charge of locking to protect the kring
4070 * while this operation is being performed. This is normally achieved by
4071 * calling netmap_disable_all_rings() before triggering a reset.
4072 * If the kring is not in netmap mode, return NULL to inform the caller
4073 * that this is the case.
4074 * If the kring is in netmap mode, set hwofs so that the netmap indices
4075 * seen by userspace (head/cut/tail) do not change, although the internal
4076 * NIC indices have been reset to 0.
4077 * In any case, adjust kring->nr_mode.
4078 */
4079 struct netmap_slot *
netmap_reset(struct netmap_adapter * na,enum txrx tx,u_int n,u_int new_cur)4080 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
4081 u_int new_cur)
4082 {
4083 struct netmap_kring *kring;
4084 u_int new_hwtail, new_hwofs;
4085
4086 if (!nm_native_on(na)) {
4087 nm_prdis("interface not in native netmap mode");
4088 return NULL; /* nothing to reinitialize */
4089 }
4090
4091 if (tx == NR_TX) {
4092 if (n >= na->num_tx_rings)
4093 return NULL;
4094 kring = na->tx_rings[n];
4095 /*
4096 * Set hwofs to rhead, so that slots[rhead] is mapped to
4097 * the NIC internal slot 0, and thus the netmap buffer
4098 * at rhead is the next to be transmitted. Transmissions
4099 * that were pending before the reset are considered as
4100 * sent, so that we can have hwcur = rhead. All the slots
4101 * are now owned by the user, so we can also reinit hwtail.
4102 */
4103 new_hwofs = kring->rhead;
4104 new_hwtail = nm_prev(kring->rhead, kring->nkr_num_slots - 1);
4105 } else {
4106 if (n >= na->num_rx_rings)
4107 return NULL;
4108 kring = na->rx_rings[n];
4109 /*
4110 * Set hwofs to hwtail, so that slots[hwtail] is mapped to
4111 * the NIC internal slot 0, and thus the netmap buffer
4112 * at hwtail is the next to be given to the NIC.
4113 * Unread slots (the ones in [rhead,hwtail[) are owned by
4114 * the user, and thus the caller cannot give them
4115 * to the NIC right now.
4116 */
4117 new_hwofs = kring->nr_hwtail;
4118 new_hwtail = kring->nr_hwtail;
4119 }
4120 if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
4121 kring->nr_mode = NKR_NETMAP_OFF;
4122 return NULL;
4123 }
4124 if (netmap_verbose) {
4125 nm_prinf("%s, hc %u->%u, ht %u->%u, ho %u->%u", kring->name,
4126 kring->nr_hwcur, kring->rhead,
4127 kring->nr_hwtail, new_hwtail,
4128 kring->nkr_hwofs, new_hwofs);
4129 }
4130 kring->nr_hwcur = kring->rhead;
4131 kring->nr_hwtail = new_hwtail;
4132 kring->nkr_hwofs = new_hwofs;
4133
4134 /*
4135 * Wakeup on the individual and global selwait
4136 * We do the wakeup here, but the ring is not yet reconfigured.
4137 * However, we are under lock so there are no races.
4138 */
4139 kring->nr_mode = NKR_NETMAP_ON;
4140 kring->nm_notify(kring, 0);
4141 return kring->ring->slot;
4142 }
4143
4144
4145 /*
4146 * Dispatch rx/tx interrupts to the netmap rings.
4147 *
4148 * "work_done" is non-null on the RX path, NULL for the TX path.
4149 * We rely on the OS to make sure that there is only one active
4150 * instance per queue, and that there is appropriate locking.
4151 *
4152 * The 'notify' routine depends on what the ring is attached to.
4153 * - for a netmap file descriptor, do a selwakeup on the individual
4154 * waitqueue, plus one on the global one if needed
4155 * (see netmap_notify)
4156 * - for a nic connected to a switch, call the proper forwarding routine
4157 * (see netmap_bwrap_intr_notify)
4158 */
4159 int
netmap_common_irq(struct netmap_adapter * na,u_int q,u_int * work_done)4160 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4161 {
4162 struct netmap_kring *kring;
4163 enum txrx t = (work_done ? NR_RX : NR_TX);
4164
4165 q &= NETMAP_RING_MASK;
4166
4167 if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4168 nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
4169 }
4170
4171 if (q >= nma_get_nrings(na, t))
4172 return NM_IRQ_PASS; // not a physical queue
4173
4174 kring = NMR(na, t)[q];
4175
4176 if (kring->nr_mode == NKR_NETMAP_OFF) {
4177 return NM_IRQ_PASS;
4178 }
4179
4180 if (t == NR_RX) {
4181 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ?
4182 *work_done = 1; /* do not fire napi again */
4183 }
4184
4185 return kring->nm_notify(kring, 0);
4186 }
4187
4188
4189 /*
4190 * Default functions to handle rx/tx interrupts from a physical device.
4191 * "work_done" is non-null on the RX path, NULL for the TX path.
4192 *
4193 * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4194 * so that the caller proceeds with regular processing.
4195 * Otherwise call netmap_common_irq().
4196 *
4197 * If the card is connected to a netmap file descriptor,
4198 * do a selwakeup on the individual queue, plus one on the global one
4199 * if needed (multiqueue card _and_ there are multiqueue listeners),
4200 * and return NR_IRQ_COMPLETED.
4201 *
4202 * Finally, if called on rx from an interface connected to a switch,
4203 * calls the proper forwarding routine.
4204 */
4205 int
netmap_rx_irq(struct ifnet * ifp,u_int q,u_int * work_done)4206 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4207 {
4208 struct netmap_adapter *na = NA(ifp);
4209
4210 /*
4211 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4212 * we still use the regular driver even though the previous
4213 * check fails. It is unclear whether we should use
4214 * nm_native_on() here.
4215 */
4216 if (!nm_netmap_on(na))
4217 return NM_IRQ_PASS;
4218
4219 if (na->na_flags & NAF_SKIP_INTR) {
4220 nm_prdis("use regular interrupt");
4221 return NM_IRQ_PASS;
4222 }
4223
4224 return netmap_common_irq(na, q, work_done);
4225 }
4226
4227 /* set/clear native flags and if_transmit/netdev_ops */
4228 void
nm_set_native_flags(struct netmap_adapter * na)4229 nm_set_native_flags(struct netmap_adapter *na)
4230 {
4231 struct ifnet *ifp = na->ifp;
4232
4233 /* We do the setup for intercepting packets only if we are the
4234 * first user of this adapter. */
4235 if (na->active_fds > 0) {
4236 return;
4237 }
4238
4239 na->na_flags |= NAF_NETMAP_ON;
4240 nm_os_onenter(ifp);
4241 nm_update_hostrings_mode(na);
4242 }
4243
4244 void
nm_clear_native_flags(struct netmap_adapter * na)4245 nm_clear_native_flags(struct netmap_adapter *na)
4246 {
4247 struct ifnet *ifp = na->ifp;
4248
4249 /* We undo the setup for intercepting packets only if we are the
4250 * last user of this adapter. */
4251 if (na->active_fds > 0) {
4252 return;
4253 }
4254
4255 nm_update_hostrings_mode(na);
4256 nm_os_onexit(ifp);
4257
4258 na->na_flags &= ~NAF_NETMAP_ON;
4259 }
4260
4261 void
netmap_krings_mode_commit(struct netmap_adapter * na,int onoff)4262 netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
4263 {
4264 enum txrx t;
4265
4266 for_rx_tx(t) {
4267 int i;
4268
4269 for (i = 0; i < netmap_real_rings(na, t); i++) {
4270 struct netmap_kring *kring = NMR(na, t)[i];
4271
4272 if (onoff && nm_kring_pending_on(kring))
4273 kring->nr_mode = NKR_NETMAP_ON;
4274 else if (!onoff && nm_kring_pending_off(kring))
4275 kring->nr_mode = NKR_NETMAP_OFF;
4276 }
4277 }
4278 }
4279
/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. It returns 0 on success and a non-zero
 * errno value on failure.
 *
 * netmap_fini() destroys everything.
 */

/*
 * Handle returned by make_dev_credf() in netmap_init(); it is NULL
 * until the device has been created, and netmap_fini() destroys the
 * device only when it is non-NULL.
 */
static struct cdev *netmap_dev; /* /dev/netmap character device. */
/*
 * Device switch table passed to make_dev_credf() when creating
 * /dev/netmap; its definition is not in this part of the file.
 */
extern struct cdevsw netmap_cdevsw;
4292
4293
4294 void
netmap_fini(void)4295 netmap_fini(void)
4296 {
4297 if (netmap_dev)
4298 destroy_dev(netmap_dev);
4299 /* we assume that there are no longer netmap users */
4300 nm_os_ifnet_fini();
4301 netmap_uninit_bridges();
4302 netmap_mem_fini();
4303 NMG_LOCK_DESTROY();
4304 nm_prinf("netmap: unloaded module.");
4305 }
4306
4307
4308 int
netmap_init(void)4309 netmap_init(void)
4310 {
4311 int error;
4312
4313 NMG_LOCK_INIT();
4314
4315 error = netmap_mem_init();
4316 if (error != 0)
4317 goto fail;
4318 /*
4319 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4320 * when the module is compiled in.
4321 * XXX could use make_dev_credv() to get error number
4322 */
4323 netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4324 &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4325 "netmap");
4326 if (!netmap_dev)
4327 goto fail;
4328
4329 error = netmap_init_bridges();
4330 if (error)
4331 goto fail;
4332
4333 #ifdef __FreeBSD__
4334 nm_os_vi_init_index();
4335 #endif
4336
4337 error = nm_os_ifnet_init();
4338 if (error)
4339 goto fail;
4340
4341 nm_prinf("netmap: loaded module");
4342 return (0);
4343 fail:
4344 netmap_fini();
4345 return (EINVAL); /* may be incorrect */
4346 }
4347