xref: /dragonfly/sys/net/netmap/netmap.c (revision f933b737dabc806a2f1680f0afea2fb42a345b92)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module supports memory mapped access to network devices,
29  * see netmap(4).
30  *
31  * The module uses a large memory pool allocated by the kernel
32  * and accessible as mmapped memory by multiple userspace threads/processes.
33  * The memory pool contains packet buffers and "netmap rings",
34  * i.e. user-accessible copies of the interface's queues.
35  *
36  * Access to the network card works like this:
37  * 1. a process/thread issues one or more open() on /dev/netmap, to create
38  *    select()able file descriptors on which events are reported.
39  * 2. on each descriptor, the process issues an ioctl() to identify
40  *    the interface that should report events to the file descriptor.
41  * 3. on each descriptor, the process issues an mmap() request to
42  *    map the shared memory region within the process' address space.
43  *    The list of interesting queues is indicated by a location in
44  *    the shared memory region.
45  * 4. using the functions in the netmap(4) userspace API, a process
46  *    can look up the occupation state of a queue, access memory buffers,
47  *    and retrieve received packets or enqueue packets to transmit.
48  * 5. using some ioctl()s the process can synchronize the userspace view
49  *    of the queue with the actual status in the kernel. This includes both
50  *    receiving the notification of new packets, and transmitting new
51  *    packets on the output interface.
52  * 6. select() or poll() can be used to wait for events on individual
53  *    transmit or receive queues (or all queues for a given interface).
54  *
55 
56                     SYNCHRONIZATION (USER)
57 
58 The netmap rings and data structures may be shared among multiple
59 user threads or even independent processes.
60 Any synchronization among those threads/processes is delegated
61 to the threads themselves. Only one thread at a time can be in
62 a system call on the same netmap ring. The OS does not enforce
63 this and only guarantees against system crashes in case of
64 invalid usage.
65 
66                     LOCKING (INTERNAL)
67 
68 Within the kernel, access to the netmap rings is protected as follows:
69 
70 - a spinlock on each ring, to handle producer/consumer races on
71   RX rings attached to the host stack (against multiple host
72   threads writing from the host stack to the same ring),
73   and on 'destination' rings attached to a VALE switch
74   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
75   protecting against multiple active senders for the same destination
76 
77 - an atomic variable to guarantee that there is at most one
78   instance of *_*xsync() on the ring at any time.
79   For rings connected to user file
80   descriptors, an atomic_test_and_set() protects this, and the
81   lock on the ring is not actually used.
82   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
83   is also used to prevent multiple executions (the driver might indeed
84   already guarantee this).
85   For NIC TX rings connected to a VALE switch, the lock arbitrates
86   access to the queue (both when allocating buffers and when pushing
87   them out).
88 
89 - *xsync() should be protected against initializations of the card.
90   On FreeBSD most devices have the reset routine protected by
91   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
92   the RING protection on rx_reset(), this should be added.
93 
94   On linux there is an external lock on the tx path, which probably
95   also arbitrates access to the reset routine. XXX to be revised
96 
97 - a per-interface core_lock protecting access from the host stack
98   while interfaces may be detached from netmap mode.
99   XXX there should be no need for this lock if we detach the interfaces
100   only while they are down.
101 
102 
103 --- VALE SWITCH ---
104 
105 NMG_LOCK() serializes all modifications to switches and ports.
106 A switch cannot be deleted until all ports are gone.
107 
108 For each switch, an SX lock (RWlock on linux) protects
109 deletion of ports. When configuring or deleting a new port, the
110 lock is acquired in exclusive mode (after holding NMG_LOCK).
111 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
112 The lock is held throughout the entire forwarding cycle,
113 during which the thread may incur a page fault.
114 Hence it is important that sleepable shared locks are used.
115 
116 On the rx ring, the per-port lock is grabbed initially to reserve
117 a number of slots in the ring, then the lock is released,
118 packets are copied from source to destination, and then
119 the lock is acquired again and the receive ring is updated.
120 (A similar thing is done on the tx ring for NIC and host stack
121 ports attached to the switch)
122 
123  */
124 
125 /*
126  * OS-specific code that is used only within this file.
127  * Other OS-specific code that must be accessed by drivers
128  * is present in netmap_kern.h
129  */
130 
131 #include <sys/cdefs.h> /* prerequisite */
132 __FBSDID("$FreeBSD: head/sys/dev/netmap/netmap.c 257176 2013-10-26 17:58:36Z glebius $");
133 
134 #include <sys/types.h>
135 #include <sys/errno.h>
136 #include <sys/param.h>        /* defines used in kernel.h */
137 #include <sys/kernel.h>       /* types used in module initialization */
138 #include <sys/conf.h>         /* cdevsw struct, UID, GID */
139 #include <sys/devfs.h>
140 #include <sys/sockio.h>
141 #include <sys/socketvar.h>    /* struct socket */
142 #include <sys/malloc.h>
143 #include <sys/poll.h>
144 #include <sys/lock.h>
145 #include <sys/socket.h> /* sockaddrs */
146 #include <sys/event.h>
147 #include <sys/sysctl.h>
148 #include <net/if.h>
149 #include <net/if_var.h>
150 #include <net/bpf.h>                    /* BIOCIMMEDIATE */
151 #include <sys/bus.h>          /* bus_dmamap_* */
152 #include <sys/endian.h>
153 #include <sys/refcount.h>
154 
155 /* reduce conditional code */
156 #define init_waitqueue_head(x)          // only needed in linux
157 
158 extern struct dev_ops netmap_cdevsw;
159 
160 /*
161  * common headers
162  */
163 #include <net/netmap/netmap.h>
164 #include <net/netmap/netmap_kern.h>
165 #include <net/netmap/netmap_mem2.h>
166 
167 
168 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
169 
170 /*
171  * The following variables are used by the drivers and replicate
172  * fields in the global memory pool. They only refer to buffers
173  * used by physical interfaces.
174  */
175 u_int netmap_total_buffers;
176 u_int netmap_buf_size;
177 char *netmap_buffer_base;     /* also address of an invalid buffer */
178 
179 /* user-controlled variables */
180 int netmap_verbose;
181 
182 static int netmap_no_timestamp; /* don't timestamp on rxsync */
183 
184 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
185 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
186     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
187 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
188     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
189 int netmap_mitigate = 1;
190 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
191 int netmap_no_pendintr = 1;
192 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
193     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
194 int netmap_txsync_retry = 2;
195 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
196     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
197 
198 int netmap_flags = 0;         /* debug flags */
199 int netmap_fwd = 0; /* force transparent mode */
200 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
201 
202 /*
203  * netmap_admode selects the netmap mode to use.
204  * Invalid values are reset to NETMAP_ADMODE_BEST
205  */
206 enum { NETMAP_ADMODE_BEST = 0,          /* use native, fallback to generic */
207           NETMAP_ADMODE_NATIVE,         /* either native or none */
208           NETMAP_ADMODE_GENERIC,        /* force generic */
209           NETMAP_ADMODE_LAST };
210 #define NETMAP_ADMODE_NATIVE        1  /* Force native netmap adapter. */
211 #define NETMAP_ADMODE_GENERIC       2  /* Force generic netmap adapter. */
212 #define NETMAP_ADMODE_BEST          0  /* Priority to native netmap adapter. */
213 static int netmap_admode = NETMAP_ADMODE_BEST;
214 
215 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
216 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
217 
218 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
219 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
220 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
221 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
222 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
223 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
224 
225 NMG_LOCK_T          netmap_global_lock;
226 
227 
228 static void
nm_kr_get(struct netmap_kring * kr)229 nm_kr_get(struct netmap_kring *kr)
230 {
231           while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
232                     tsleep(kr, 0, "NM_KR_GET", 4);
233 }
234 
235 
236 void
netmap_disable_ring(struct netmap_kring * kr)237 netmap_disable_ring(struct netmap_kring *kr)
238 {
239           kr->nkr_stopped = 1;
240           nm_kr_get(kr);
241           lockmgr(&kr->q_lock, LK_EXCLUSIVE);
242           lockmgr(&kr->q_lock, LK_RELEASE);
243           nm_kr_put(kr);
244 }
245 
246 
247 static void
netmap_set_all_rings(struct ifnet * ifp,int stopped)248 netmap_set_all_rings(struct ifnet *ifp, int stopped)
249 {
250           struct netmap_adapter *na;
251           int i;
252 
253           if (!(ifp->if_capenable & IFCAP_NETMAP))
254                     return;
255 
256           na = NA(ifp);
257 
258           for (i = 0; i <= na->num_tx_rings; i++) {
259                     if (stopped)
260                               netmap_disable_ring(na->tx_rings + i);
261                     else
262                               na->tx_rings[i].nkr_stopped = 0;
263                     na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
264                               (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
265           }
266 
267           for (i = 0; i <= na->num_rx_rings; i++) {
268                     if (stopped)
269                               netmap_disable_ring(na->rx_rings + i);
270                     else
271                               na->rx_rings[i].nkr_stopped = 0;
272                     na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
273                               (i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
274           }
275 }
276 
277 
/*
 * Stop all the krings of an interface.
 * Thin wrapper around netmap_set_all_rings() with stopped = 1.
 */
void
netmap_disable_all_rings(struct ifnet *ifp)
{
	const int stopped = 1;

	netmap_set_all_rings(ifp, stopped);
}
283 
284 
/*
 * Restart all the krings of an interface.
 * Thin wrapper around netmap_set_all_rings() with stopped = 0.
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	const int stopped = 0;

	netmap_set_all_rings(ifp, stopped);
}
290 
291 
/*
 * generic bound_checking function
 *
 * Forces *v into the range [lo, hi] and returns the resulting value:
 *  - a value below lo is replaced by dflt ("Bump"), where dflt has
 *    itself first been forced into [lo, hi];
 *  - a value above hi is replaced by hi ("Clamp");
 *  - any other value is left alone.
 * When the value is changed and msg is not NULL, the change is logged
 * using msg as the name of the variable.
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	const u_int oldv = *v;
	const char *op = NULL;

	/* sanitize the default before it can be used */
	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;

	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}

	/* both values are unsigned: print them as such */
	if (op != NULL && msg != NULL)
		kprintf("%s %s to %u (was %u)\n", op, msg, *v, oldv);
	return *v;
}
316 
317 
/*
 * packet-dump function, user-supplied or static buffer.
 *
 * Formats the first 'lim' bytes of the 'len'-byte buffer 'p' as a
 * header line followed by rows of up to 16 bytes, each row showing
 * the offset, the bytes in hex and the bytes as printable ASCII
 * ('.' for anything outside 0x20..0x7e).
 * lim <= 0 or lim > len means "dump len bytes".
 *
 * A caller-supplied destination buffer must be at least 30+4*len
 * bytes. If dst is NULL an internal static buffer is used (so the
 * result is overwritten by the next such call); in that case the
 * dump is truncated to what fits in that buffer.
 *
 * Returns the buffer holding the NUL-terminated dump.
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	enum {
		NM_DUMP_ROW = 16,		/* packet bytes per output row */
		NM_DUMP_HEXW = 3 * NM_DUMP_ROW,	/* width of the hex column */
		NM_DUMP_HDR = 64,		/* upper bound for the header line */
		/* "%5d: " + hex column + ASCII column + newline */
		NM_DUMP_ROWLEN = 7 + NM_DUMP_HEXW + NM_DUMP_ROW + 1,
		NM_DUMP_BUFSZ = 8192		/* size of the static buffer */
	};
	static char _dst[NM_DUMP_BUFSZ];
	static const char hex[] = "0123456789abcdef";
	char *o;	/* output position */
	int i, j, n;

	if (lim <= 0 || lim > len)
		lim = len;
	if (dst == NULL) {
		/* largest whole number of rows that fits after the header */
		const int max = (NM_DUMP_BUFSZ - NM_DUMP_HDR - 1) /
		    NM_DUMP_ROWLEN * NM_DUMP_ROW;

		dst = _dst;
		if (lim > max)
			lim = max;
	}

	o = dst;
	ksprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);

	/* hexdump routine */
	for (i = 0; i < lim; i += n) {
		n = lim - i;
		if (n > NM_DUMP_ROW)
			n = NM_DUMP_ROW;

		ksprintf(o, "%5d: ", i);
		o += strlen(o);

		/* blank the hex column, then fill in the bytes we have */
		memset(o, ' ', NM_DUMP_HEXW);
		for (j = 0; j < n; j++) {
			const unsigned char c = (unsigned char)p[i + j];

			o[j * 3] = hex[c >> 4];
			o[j * 3 + 1] = hex[c & 0xf];
			o[NM_DUMP_HEXW + j] =
			    (c >= 0x20 && c <= 0x7e) ? (char)c : '.';
		}
		o[NM_DUMP_HEXW + n] = '\n';
		o += NM_DUMP_HEXW + n + 1;
	}
	*o = '\0';
	return dst;
}
362 
363 
364 
365 /*
366  * Fetch configuration from the device, to cope with dynamic
367  * reconfigurations after loading the module.
368  */
369 int
netmap_update_config(struct netmap_adapter * na)370 netmap_update_config(struct netmap_adapter *na)
371 {
372           struct ifnet *ifp = na->ifp;
373           u_int txr, txd, rxr, rxd;
374 
375           txr = txd = rxr = rxd = 0;
376           if (na->nm_config) {
377                     na->nm_config(na, &txr, &txd, &rxr, &rxd);
378           } else {
379                     /* take whatever we had at init time */
380                     txr = na->num_tx_rings;
381                     txd = na->num_tx_desc;
382                     rxr = na->num_rx_rings;
383                     rxd = na->num_rx_desc;
384           }
385 
386           if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
387               na->num_rx_rings == rxr && na->num_rx_desc == rxd)
388                     return 0; /* nothing changed */
389           if (netmap_verbose || na->active_fds > 0) {
390                     D("stored config %s: txring %d x %d, rxring %d x %d",
391                               NM_IFPNAME(ifp),
392                               na->num_tx_rings, na->num_tx_desc,
393                               na->num_rx_rings, na->num_rx_desc);
394                     D("new config %s: txring %d x %d, rxring %d x %d",
395                               NM_IFPNAME(ifp), txr, txd, rxr, rxd);
396           }
397           if (na->active_fds == 0) {
398                     D("configuration changed (but fine)");
399                     na->num_tx_rings = txr;
400                     na->num_tx_desc = txd;
401                     na->num_rx_rings = rxr;
402                     na->num_rx_desc = rxd;
403                     return 0;
404           }
405           D("configuration changed while active, this is bad...");
406           return 1;
407 }
408 
409 
410 int
netmap_krings_create(struct netmap_adapter * na,u_int ntx,u_int nrx,u_int tailroom)411 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
412 {
413           u_int i, len, ndesc;
414           struct netmap_kring *kring;
415 
416           len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
417 
418           na->tx_rings = kmalloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
419           if (na->tx_rings == NULL) {
420                     D("Cannot allocate krings");
421                     return ENOMEM;
422           }
423           na->rx_rings = na->tx_rings + ntx;
424 
425           ndesc = na->num_tx_desc;
426           for (i = 0; i < ntx; i++) { /* Transmit rings */
427                     kring = &na->tx_rings[i];
428                     bzero(kring, sizeof(*kring));
429                     kring->na = na;
430                     kring->nkr_num_slots = ndesc;
431                     /*
432                      * IMPORTANT:
433                      * Always keep one slot empty, so we can detect new
434                      * transmissions comparing cur and nr_hwcur (they are
435                      * the same only if there are no new transmissions).
436                      */
437                     kring->nr_hwavail = ndesc - 1;
438                     lockinit(&kring->q_lock, "nm_txq_lock", 0, LK_CANRECURSE);
439                     init_waitqueue_head(&kring->si);
440           }
441 
442           ndesc = na->num_rx_desc;
443           for (i = 0; i < nrx; i++) { /* Receive rings */
444                     kring = &na->rx_rings[i];
445                     bzero(kring, sizeof(*kring));
446                     kring->na = na;
447                     kring->nkr_num_slots = ndesc;
448                     lockinit(&kring->q_lock, "nm_rxq_lock", 0, LK_CANRECURSE);
449                     init_waitqueue_head(&kring->si);
450           }
451           init_waitqueue_head(&na->tx_si);
452           init_waitqueue_head(&na->rx_si);
453 
454           na->tailroom = na->rx_rings + nrx;
455 
456           return 0;
457 
458 }
459 
460 
461 void
netmap_krings_delete(struct netmap_adapter * na)462 netmap_krings_delete(struct netmap_adapter *na)
463 {
464           int i;
465 
466           for (i = 0; i < na->num_tx_rings + 1; i++) {
467                     lockuninit(&na->tx_rings[i].q_lock);
468           }
469           for (i = 0; i < na->num_rx_rings + 1; i++) {
470                     lockuninit(&na->rx_rings[i].q_lock);
471           }
472           kfree(na->tx_rings, M_DEVBUF);
473           na->tx_rings = na->rx_rings = na->tailroom = NULL;
474 }
475 
476 
477 static struct netmap_if*
netmap_if_new(const char * ifname,struct netmap_adapter * na)478 netmap_if_new(const char *ifname, struct netmap_adapter *na)
479 {
480           struct netmap_if *nifp;
481 
482           if (netmap_update_config(na)) {
483                     /* configuration mismatch, report and fail */
484                     return NULL;
485           }
486 
487           if (na->active_fds)
488                     goto final;
489 
490           if (na->nm_krings_create(na))
491                     goto cleanup;
492 
493           if (netmap_mem_rings_create(na))
494                     goto cleanup;
495 
496 final:
497 
498           nifp = netmap_mem_if_new(ifname, na);
499           if (nifp == NULL)
500                     goto cleanup;
501 
502           return (nifp);
503 
504 cleanup:
505 
506           if (na->active_fds == 0) {
507                     netmap_mem_rings_delete(na);
508                     na->nm_krings_delete(na);
509           }
510 
511           return NULL;
512 }
513 
514 
515 /* grab a reference to the memory allocator, if we don't have one already.  The
516  * reference is taken from the netmap_adapter registered with the priv.
517  *
518  */
519 static int
netmap_get_memory_locked(struct netmap_priv_d * p)520 netmap_get_memory_locked(struct netmap_priv_d* p)
521 {
522           struct netmap_mem_d *nmd;
523           int error = 0;
524 
525           if (p->np_na == NULL) {
526                     if (!netmap_mmap_unreg)
527                               return ENODEV;
528                     /* for compatibility with older versions of the API
529                      * we use the global allocator when no interface has been
530                      * registered
531                      */
532                     nmd = &nm_mem;
533           } else {
534                     nmd = p->np_na->nm_mem;
535           }
536           if (p->np_mref == NULL) {
537                     error = netmap_mem_finalize(nmd);
538                     if (!error)
539                               p->np_mref = nmd;
540           } else if (p->np_mref != nmd) {
541                     /* a virtual port has been registered, but previous
542                      * syscalls already used the global allocator.
543                      * We cannot continue
544                      */
545                     error = ENODEV;
546           }
547           return error;
548 }
549 
550 
/*
 * Same as netmap_get_memory_locked(), for callers that do not hold
 * the global lock: NMG_LOCK is taken around the call.
 */
int
netmap_get_memory(struct netmap_priv_d *p)
{
	int ret;

	NMG_LOCK();
	ret = netmap_get_memory_locked(p);
	NMG_UNLOCK();

	return ret;
}
560 
561 
562 static int
netmap_have_memory_locked(struct netmap_priv_d * p)563 netmap_have_memory_locked(struct netmap_priv_d* p)
564 {
565           return p->np_mref != NULL;
566 }
567 
568 
569 static void
netmap_drop_memory_locked(struct netmap_priv_d * p)570 netmap_drop_memory_locked(struct netmap_priv_d* p)
571 {
572           if (p->np_mref) {
573                     netmap_mem_deref(p->np_mref);
574                     p->np_mref = NULL;
575           }
576 }
577 
578 
579 /*
580  * File descriptor's private data destructor.
581  *
582  * Call nm_register(ifp,0) to stop netmap mode on the interface and
583  * revert to normal operation. We expect that np_na->ifp has not gone.
584  * The second argument is the nifp to work on. In some cases it is
585  * not attached yet to the netmap_priv_d so we need to pass it as
586  * a separate argument.
587  */
588 /* call with NMG_LOCK held */
589 static void
netmap_do_unregif(struct netmap_priv_d * priv,struct netmap_if * nifp)590 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
591 {
592           struct netmap_adapter *na = priv->np_na;
593           struct ifnet *ifp = na->ifp;
594 
595           NMG_LOCK_ASSERT();
596           na->active_fds--;
597           if (na->active_fds <= 0) {    /* last instance */
598 
599                     if (netmap_verbose)
600                               D("deleting last instance for %s", NM_IFPNAME(ifp));
601                     /*
602                      * (TO CHECK) This function is only called
603                      * when the last reference to this file descriptor goes
604                      * away. This means we cannot have any pending poll()
605                      * or interrupt routine operating on the structure.
606                      * XXX The file may be closed in a thread while
607                      * another thread is using it.
608                      * Linux keeps the file opened until the last reference
609                      * by any outstanding ioctl/poll or mmap is gone.
610                      * FreeBSD does not track mmap()s (but we do) and
611                      * wakes up any sleeping poll(). Need to check what
612                      * happens if the close() occurs while a concurrent
613                      * syscall is running.
614                      */
615                     if (ifp)
616                               na->nm_register(na, 0); /* off, clear IFCAP_NETMAP */
617                     /* Wake up any sleeping threads. netmap_poll will
618                      * then return POLLERR
619                      * XXX The wake up now must happen during *_down(), when
620                      * we order all activities to stop. -gl
621                      */
622                     /* XXX kqueue(9) needed; these will mirror knlist_init. */
623                     /* knlist_destroy(&na->tx_si.si_note); */
624                     /* knlist_destroy(&na->rx_si.si_note); */
625 
626                     /* delete rings and buffers */
627                     netmap_mem_rings_delete(na);
628                     na->nm_krings_delete(na);
629           }
630           /* delete the nifp */
631           netmap_mem_if_delete(na, nifp);
632 }
633 
634 
635 /*
636  * returns 1 if this is the last instance and we can free priv
637  */
638 int
netmap_dtor_locked(struct netmap_priv_d * priv)639 netmap_dtor_locked(struct netmap_priv_d *priv)
640 {
641           struct netmap_adapter *na = priv->np_na;
642 
643           /*
644            * np_refcount is the number of active mmaps on
645            * this file descriptor
646            */
647           if (--priv->np_refcount > 0) {
648                     return 0;
649           }
650           if (!na) {
651               return 1; //XXX is it correct?
652           }
653           netmap_do_unregif(priv, priv->np_nifp);
654           priv->np_nifp = NULL;
655           netmap_drop_memory_locked(priv);
656           if (priv->np_na) {
657                     netmap_adapter_put(na);
658                     priv->np_na = NULL;
659           }
660           return 1;
661 }
662 
663 
664 void
netmap_dtor(void * data)665 netmap_dtor(void *data)
666 {
667           struct netmap_priv_d *priv = data;
668           int last_instance;
669 
670           NMG_LOCK();
671           last_instance = netmap_dtor_locked(priv);
672           NMG_UNLOCK();
673           if (last_instance) {
674                     bzero(priv, sizeof(*priv));   /* for safety */
675                     kfree(priv, M_DEVBUF);
676           }
677 }
678 
679 
680 
681 
682 /*
683  * Handlers for synchronization of the queues from/to the host.
684  * Netmap has two operating modes:
685  * - in the default mode, the rings connected to the host stack are
686  *   just another ring pair managed by userspace;
687  * - in transparent mode (XXX to be defined) incoming packets
688  *   (from the host or the NIC) are marked as NS_FORWARD upon
689  *   arrival, and the user application has a chance to reset the
690  *   flag for packets that should be dropped.
691  *   On the RXSYNC or poll(), packets in RX rings between
692  *   kring->nr_kcur and ring->cur with NS_FORWARD still set are moved
693  *   to the other side.
694  * The transfer NIC --> host is relatively easy, just encapsulate
695  * into mbufs and we are done. The host --> NIC side is slightly
696  * harder because there might not be room in the tx ring so it
697  * might take a while before releasing the buffer.
698  */
699 
700 
701 /*
702  * pass a chain of buffers to the host stack as coming from 'dst'
703  */
704 static void
netmap_send_up(struct ifnet * dst,struct mbq * q)705 netmap_send_up(struct ifnet *dst, struct mbq *q)
706 {
707           struct mbuf *m;
708 
709           /* send packets up, outside the lock */
710           while ((m = mbq_dequeue(q)) != NULL) {
711                     if (netmap_verbose & NM_VERB_HOST)
712                               D("sending up pkt %p size %d", m, MBUF_LEN(m));
713                     NM_SEND_UP(dst, m);
714           }
715           mbq_destroy(q);
716 }
717 
718 
719 /*
720  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
721  * Run from hwcur to cur - reserved
722  */
723 static void
netmap_grab_packets(struct netmap_kring * kring,struct mbq * q,int force)724 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
725 {
726           /* Take packets from hwcur to cur-reserved and pass them up.
727            * In case of no buffers we give up. At the end of the loop,
728            * the queue is drained in all cases.
729            * XXX handle reserved
730            */
731           u_int lim = kring->nkr_num_slots - 1;
732           struct mbuf *m;
733           u_int k = kring->ring->cur, n = kring->ring->reserved;
734           struct netmap_adapter *na = kring->na;
735 
736           /* compute the final position, ring->cur - ring->reserved */
737           if (n > 0) {
738                     if (k < n)
739                               k += kring->nkr_num_slots;
740                     k += n;
741           }
742           for (n = kring->nr_hwcur; n != k;) {
743                     struct netmap_slot *slot = &kring->ring->slot[n];
744 
745                     n = nm_next(n, lim);
746                     if ((slot->flags & NS_FORWARD) == 0 && !force)
747                               continue;
748                     if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
749                               D("bad pkt at %d len %d", n, slot->len);
750                               continue;
751                     }
752                     slot->flags &= ~NS_FORWARD; // XXX needed ?
753                     /* XXX adapt to the case of a multisegment packet */
754                     m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp);
755 
756                     if (m == NULL)
757                               break;
758                     mbq_enqueue(q, m);
759           }
760 }
761 
762 
763 /*
764  * The host ring has packets from nr_hwcur to (cur - reserved)
765  * to be sent down to the NIC.
766  * We need to use the queue lock on the source (host RX ring)
767  * to protect against netmap_transmit.
768  * If the user is well behaved we do not need to acquire locks
769  * on the destination(s),
770  * so we only need to make sure that there are no panics because
771  * of user errors.
772  * XXX verify
773  *
774  * We scan the tx rings, which have just been
775  * flushed so nr_hwcur == cur. Pushing packets down means
776  * increment cur and decrement avail.
777  * XXX to be verified
778  */
779 static void
netmap_sw_to_nic(struct netmap_adapter * na)780 netmap_sw_to_nic(struct netmap_adapter *na)
781 {
782           struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
783           struct netmap_kring *k1 = &na->tx_rings[0];
784           u_int i, howmany, src_lim, dst_lim;
785 
786           /* XXX we should also check that the carrier is on */
787           if (kring->nkr_stopped)
788                     return;
789 
790           lockmgr(&kring->q_lock, LK_EXCLUSIVE);
791 
792           if (kring->nkr_stopped)
793                     goto out;
794 
795           howmany = kring->nr_hwavail;  /* XXX otherwise cur - reserved - nr_hwcur */
796 
797           src_lim = kring->nkr_num_slots - 1;
798           for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
799                     ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
800                     dst_lim = k1->nkr_num_slots - 1;
801                     while (howmany > 0 && k1->ring->avail > 0) {
802                               struct netmap_slot *src, *dst, tmp;
803                               src = &kring->ring->slot[kring->nr_hwcur];
804                               dst = &k1->ring->slot[k1->ring->cur];
805                               tmp = *src;
806                               src->buf_idx = dst->buf_idx;
807                               src->flags = NS_BUF_CHANGED;
808 
809                               dst->buf_idx = tmp.buf_idx;
810                               dst->len = tmp.len;
811                               dst->flags = NS_BUF_CHANGED;
812                               ND("out len %d buf %d from %d to %d",
813                                         dst->len, dst->buf_idx,
814                                         kring->nr_hwcur, k1->ring->cur);
815 
816                               kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
817                               howmany--;
818                               kring->nr_hwavail--;
819                               k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
820                               k1->ring->avail--;
821                     }
822                     kring->ring->cur = kring->nr_hwcur; // XXX
823                     k1++; // XXX why?
824           }
825 out:
826           lockmgr(&kring->q_lock, LK_RELEASE);
827 }
828 
829 
830 /*
831  * netmap_txsync_to_host() passes packets up. We are called from a
832  * system call in user process context, and the only contention
833  * can be among multiple user threads erroneously calling
834  * this routine concurrently.
835  */
836 void
netmap_txsync_to_host(struct netmap_adapter * na)837 netmap_txsync_to_host(struct netmap_adapter *na)
838 {
839           struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
840           struct netmap_ring *ring = kring->ring;
841           u_int k, lim = kring->nkr_num_slots - 1;
842           struct mbq q;
843           int error;
844 
845           error = nm_kr_tryget(kring);
846           if (error) {
847                     if (error == NM_KR_BUSY)
848                               D("ring %p busy (user error)", kring);
849                     return;
850           }
851           k = ring->cur;
852           if (k > lim) {
853                     D("invalid ring index in stack TX kring %p", kring);
854                     netmap_ring_reinit(kring);
855                     nm_kr_put(kring);
856                     return;
857           }
858 
859           /* Take packets from hwcur to cur and pass them up.
860            * In case of no buffers we give up. At the end of the loop,
861            * the queue is drained in all cases.
862            */
863           mbq_init(&q);
864           netmap_grab_packets(kring, &q, 1);
865           kring->nr_hwcur = k;
866           kring->nr_hwavail = ring->avail = lim;
867 
868           nm_kr_put(kring);
869           netmap_send_up(na->ifp, &q);
870 }
871 
872 
873 /*
874  * rxsync backend for packets coming from the host stack.
875  * They have been put in the queue by netmap_transmit() so we
876  * need to protect access to the kring using a lock.
877  *
878  * This routine also does the selrecord if called from the poll handler
879  * (we know because td != NULL).
880  *
881  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
882  *     as an additional hidden argument.
883  */
884 static void
netmap_rxsync_from_host(struct netmap_adapter * na,struct thread * td,void * pwait)885 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
886 {
887           struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
888           struct netmap_ring *ring = kring->ring;
889           u_int j, n, lim = kring->nkr_num_slots;
890           u_int k = ring->cur, resvd = ring->reserved;
891 
892           (void)pwait;        /* disable unused warnings */
893 
894           if (kring->nkr_stopped) /* check a first time without lock */
895                     return;
896 
897           lockmgr(&kring->q_lock, LK_EXCLUSIVE);
898 
899           if (kring->nkr_stopped)  /* check again with lock held */
900                     goto unlock_out;
901 
902           if (k >= lim) {
903                     netmap_ring_reinit(kring);
904                     goto unlock_out;
905           }
906           /* new packets are already set in nr_hwavail */
907           /* skip past packets that userspace has released */
908           j = kring->nr_hwcur;
909           if (resvd > 0) {
910                     if (resvd + ring->avail >= lim + 1) {
911                               D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
912                               ring->reserved = resvd = 0; // XXX panic...
913                     }
914                     k = (k >= resvd) ? k - resvd : k + lim - resvd;
915           }
916           if (j != k) {
917                     n = k >= j ? k - j : k + lim - j;
918                     kring->nr_hwavail -= n;
919                     kring->nr_hwcur = k;
920           }
921           k = ring->avail = kring->nr_hwavail - resvd;
922           if (k == 0 && td)
923                     KNOTE(&kring->si.ki_note, 0);
924           if (k && (netmap_verbose & NM_VERB_HOST))
925                     D("%d pkts from stack", k);
926 unlock_out:
927 
928           lockmgr(&kring->q_lock, LK_RELEASE);
929 }
930 
931 
932 /* Get a netmap adapter for the port.
933  *
934  * If it is possible to satisfy the request, return 0
935  * with *na containing the netmap adapter found.
936  * Otherwise return an error code, with *na containing NULL.
937  *
938  * When the port is attached to a bridge, we always return
939  * EBUSY.
940  * Otherwise, if the port is already bound to a file descriptor,
941  * then we unconditionally return the existing adapter into *na.
942  * In all the other cases, we return (into *na) either native,
943  * generic or NULL, according to the following table:
944  *
945  *                                                native_support
946  * active_fds   dev.netmap.admode         YES     NO
947  * -------------------------------------------------------
948  *    >0              *                 NA(ifp) NA(ifp)
949  *
950  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
951  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
952  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
953  *
954  */
955 
956 int
netmap_get_hw_na(struct ifnet * ifp,struct netmap_adapter ** na)957 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
958 {
959           /* generic support */
960           int i = netmap_admode;        /* Take a snapshot. */
961           int error = 0;
962           struct netmap_adapter *prev_na;
963           struct netmap_generic_adapter *gna;
964 
965           *na = NULL; /* default */
966 
967           /* reset in case of invalid value */
968           if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
969                     i = netmap_admode = NETMAP_ADMODE_BEST;
970 
971           if (NETMAP_CAPABLE(ifp)) {
972                     /* If an adapter already exists, but is
973                      * attached to a vale port, we report that the
974                      * port is busy.
975                      */
976                     if (NETMAP_OWNED_BY_KERN(NA(ifp)))
977                               return EBUSY;
978 
979                     /* If an adapter already exists, return it if
980                      * there are active file descriptors or if
981                      * netmap is not forced to use generic
982                      * adapters.
983                      */
984                     if (NA(ifp)->active_fds > 0 ||
985                                         i != NETMAP_ADMODE_GENERIC) {
986                               *na = NA(ifp);
987                               return 0;
988                     }
989           }
990 
991           /* If there isn't native support and netmap is not allowed
992            * to use generic adapters, we cannot satisfy the request.
993            */
994           if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
995                     return EINVAL;
996 
997           /* Otherwise, create a generic adapter and return it,
998            * saving the previously used netmap adapter, if any.
999            *
1000            * Note that here 'prev_na', if not NULL, MUST be a
1001            * native adapter, and CANNOT be a generic one. This is
1002            * true because generic adapters are created on demand, and
1003            * destroyed when not used anymore. Therefore, if the adapter
1004            * currently attached to an interface 'ifp' is generic, it
1005            * must be that
1006            * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1007            * Consequently, if NA(ifp) is generic, we will enter one of
1008            * the branches above. This ensures that we never override
1009            * a generic adapter with another generic adapter.
1010            */
1011           prev_na = NA(ifp);
1012           error = generic_netmap_attach(ifp);
1013           if (error)
1014                     return error;
1015 
1016           *na = NA(ifp);
1017           gna = (struct netmap_generic_adapter*)NA(ifp);
1018           gna->prev = prev_na; /* save old na */
1019           if (prev_na != NULL) {
1020                     // XXX add a refcount ?
1021                     netmap_adapter_get(prev_na);
1022           }
1023           D("Created generic NA %p (prev %p)", gna, gna->prev);
1024 
1025           return 0;
1026 }
1027 
1028 
1029 /*
1030  * MUST BE CALLED UNDER NMG_LOCK()
1031  *
1032  * get a refcounted reference to an interface.
1033  * This is always called in the execution of an ioctl().
1034  *
1035  * Return ENXIO if the interface does not exist, EINVAL if netmap
1036  * is not supported by the interface.
1037  * If successful, hold a reference.
1038  *
1039  * When the NIC is attached to a bridge, reference is managed
1040  * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
1041  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1042  * is detached from the bridge, then ifp's refcount is dropped (this
1043  * is equivalent to that ifp is destroyed in case of virtual ports.
1044  *
1045  * This function uses if_rele() when we want to prevent the NIC from
1046  * being detached from the bridge in error handling.  But once refcount
1047  * is acquired by this function, it must be released using nm_if_rele().
1048  */
1049 int
netmap_get_na(struct nmreq * nmr,struct netmap_adapter ** na,int create)1050 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1051 {
1052           struct ifnet *ifp;
1053           int error = 0;
1054           struct netmap_adapter *ret;
1055 
1056           *na = NULL;     /* default return value */
1057 
1058           /* first try to see if this is a bridge port. */
1059           NMG_LOCK_ASSERT();
1060 
1061           error = netmap_get_bdg_na(nmr, na, create);
1062           if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1063                     return error;
1064 
1065           ifnet_lock();
1066 
1067           ifp = ifunit(nmr->nr_name);
1068           if (ifp == NULL) {
1069                     error = ENXIO;
1070                     goto out;
1071           }
1072 
1073           error = netmap_get_hw_na(ifp, &ret);
1074           if (error)
1075                     goto out;
1076 
1077           if (ret != NULL) {
1078                     /* Users cannot use the NIC attached to a bridge directly */
1079                     if (NETMAP_OWNED_BY_KERN(ret)) {
1080                               error = EINVAL;
1081                               goto out;
1082                     }
1083                     error = 0;
1084                     *na = ret;
1085                     netmap_adapter_get(ret);
1086           }
1087 out:
1088           ifnet_unlock();
1089           return error;
1090 }
1091 
1092 
1093 /*
1094  * Error routine called when txsync/rxsync detects an error.
1095  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1096  * Return 1 on reinit.
1097  *
1098  * This routine is only called by the upper half of the kernel.
1099  * It only reads hwcur (which is changed only by the upper half, too)
1100  * and hwavail (which may be changed by the lower half, but only on
1101  * a tx ring and only to increase it, so any error will be recovered
1102  * on the next call). For the above, we don't strictly need to call
1103  * it under lock.
1104  */
1105 int
netmap_ring_reinit(struct netmap_kring * kring)1106 netmap_ring_reinit(struct netmap_kring *kring)
1107 {
1108           struct netmap_ring *ring = kring->ring;
1109           u_int i, lim = kring->nkr_num_slots - 1;
1110           int errors = 0;
1111 
1112           // XXX KASSERT nm_kr_tryget
1113           RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1114           if (ring->cur > lim)
1115                     errors++;
1116           for (i = 0; i <= lim; i++) {
1117                     u_int idx = ring->slot[i].buf_idx;
1118                     u_int len = ring->slot[i].len;
1119                     if (idx < 2 || idx >= netmap_total_buffers) {
1120                               if (!errors++)
1121                                         D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1122                               ring->slot[i].buf_idx = 0;
1123                               ring->slot[i].len = 0;
1124                     } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1125                               ring->slot[i].len = 0;
1126                               if (!errors++)
1127                                         D("bad len %d at slot %d idx %d",
1128                                                   len, i, idx);
1129                     }
1130           }
1131           if (errors) {
1132                     int pos = kring - kring->na->tx_rings;
1133                     int n = kring->na->num_tx_rings + 1;
1134 
1135                     RD(10, "total %d errors", errors);
1136                     errors++;
1137                     RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1138                               NM_IFPNAME(kring->na->ifp),
1139                               pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1140                               ring->cur, kring->nr_hwcur,
1141                               ring->avail, kring->nr_hwavail);
1142                     ring->cur = kring->nr_hwcur;
1143                     ring->avail = kring->nr_hwavail;
1144           }
1145           return (errors ? 1 : 0);
1146 }
1147 
1148 
1149 /*
1150  * Set the ring ID. For devices with a single queue, a request
1151  * for all rings is the same as a single ring.
1152  */
1153 static int
netmap_set_ringid(struct netmap_priv_d * priv,u_int ringid)1154 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1155 {
1156           struct netmap_adapter *na = priv->np_na;
1157           struct ifnet *ifp = na->ifp;
1158           u_int i = ringid & NETMAP_RING_MASK;
1159           /* initially (np_qfirst == np_qlast) we don't want to lock */
1160           u_int lim = na->num_rx_rings;
1161 
1162           if (na->num_tx_rings > lim)
1163                     lim = na->num_tx_rings;
1164           if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1165                     D("invalid ring id %d", i);
1166                     return (EINVAL);
1167           }
1168           priv->np_ringid = ringid;
1169           if (ringid & NETMAP_SW_RING) {
1170                     priv->np_qfirst = NETMAP_SW_RING;
1171                     priv->np_qlast = 0;
1172           } else if (ringid & NETMAP_HW_RING) {
1173                     priv->np_qfirst = i;
1174                     priv->np_qlast = i + 1;
1175           } else {
1176                     priv->np_qfirst = 0;
1177                     priv->np_qlast = NETMAP_HW_RING ;
1178           }
1179           priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1180     if (netmap_verbose) {
1181           if (ringid & NETMAP_SW_RING)
1182                     D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1183           else if (ringid & NETMAP_HW_RING)
1184                     D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1185                               priv->np_qfirst);
1186           else
1187                     D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1188     }
1189           return 0;
1190 }
1191 
1192 
1193 /*
1194  * possibly move the interface to netmap-mode.
1195  * If success it returns a pointer to netmap_if, otherwise NULL.
1196  * This must be called with NMG_LOCK held.
1197  */
1198 struct netmap_if *
netmap_do_regif(struct netmap_priv_d * priv,struct netmap_adapter * na,uint16_t ringid,int * err)1199 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1200           uint16_t ringid, int *err)
1201 {
1202           struct ifnet *ifp = na->ifp;
1203           struct netmap_if *nifp = NULL;
1204           int error, need_mem = 0;
1205 
1206           NMG_LOCK_ASSERT();
1207           /* ring configuration may have changed, fetch from the card */
1208           netmap_update_config(na);
1209           priv->np_na = na;     /* store the reference */
1210           error = netmap_set_ringid(priv, ringid);
1211           if (error)
1212                     goto out;
1213           /* ensure allocators are ready */
1214           need_mem = !netmap_have_memory_locked(priv);
1215           if (need_mem) {
1216                     error = netmap_get_memory_locked(priv);
1217                     ND("get_memory returned %d", error);
1218                     if (error)
1219                               goto out;
1220           }
1221           nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1222           if (nifp == NULL) { /* allocation failed */
1223                     /* we should drop the allocator, but only
1224                      * if we were the ones who grabbed it
1225                      */
1226                     error = ENOMEM;
1227                     goto out;
1228           }
1229           na->active_fds++;
1230           if (ifp->if_capenable & IFCAP_NETMAP) {
1231                     /* was already set */
1232           } else {
1233                     /* Otherwise set the card in netmap mode
1234                      * and make it use the shared buffers.
1235                      *
1236                      * do not core lock because the race is harmless here,
1237                      * there cannot be any traffic to netmap_transmit()
1238                      */
1239                     na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1240                     ND("%p->na_lut == %p", na, na->na_lut);
1241                     na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1242                     error = na->nm_register(na, 1); /* mode on */
1243                     if (error) {
1244                               netmap_do_unregif(priv, nifp);
1245                               nifp = NULL;
1246                     }
1247           }
1248 out:
1249           *err = error;
1250           if (error) {
1251                     priv->np_na = NULL;
1252                     if (need_mem)
1253                               netmap_drop_memory_locked(priv);
1254           }
1255           if (nifp != NULL) {
1256                     /*
1257                      * advertise that the interface is ready bt setting ni_nifp.
1258                      * The barrier is needed because readers (poll and *SYNC)
1259                      * check for priv->np_nifp != NULL without locking
1260                      */
1261                     wmb(); /* make sure previous writes are visible to all CPUs */
1262                     priv->np_nifp = nifp;
1263           }
1264           return nifp;
1265 }
1266 
1267 
1268 
1269 /*
1270  * ioctl(2) support for the "netmap" device.
1271  *
1272  * Following a list of accepted commands:
1273  * - NIOCGINFO
1274  * - SIOCGIFADDR    just for convenience
1275  * - NIOCREGIF
1276  * - NIOCUNREGIF
1277  * - NIOCTXSYNC
1278  * - NIOCRXSYNC
1279  *
1280  * Return 0 on success, errno otherwise.
1281  */
1282 int
netmap_ioctl(struct dev_ioctl_args * ap)1283 netmap_ioctl(struct dev_ioctl_args *ap)
1284 {
1285           struct netmap_priv_d *priv = NULL;
1286           struct ifnet *ifp = NULL;
1287           struct nmreq *nmr = (struct nmreq *) ap->a_data;
1288           struct netmap_adapter *na = NULL;
1289           int error;
1290           u_int i, lim;
1291           struct netmap_if *nifp;
1292           struct netmap_kring *krings;
1293           u_long cmd = ap->a_cmd;
1294 
1295           error = devfs_get_cdevpriv(ap->a_fp, (void **)&priv);
1296           if (error) {
1297                     /* XXX ENOENT should be impossible, since the priv
1298                      * is now created in the open */
1299                     return (error == ENOENT ? ENXIO : error);
1300           }
1301 
1302           nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';    /* truncate name */
1303           switch (cmd) {
1304           case NIOCGINFO:               /* return capabilities etc */
1305                     if (nmr->nr_version != NETMAP_API) {
1306                               D("API mismatch got %d have %d",
1307                                         nmr->nr_version, NETMAP_API);
1308                               nmr->nr_version = NETMAP_API;
1309                               error = EINVAL;
1310                               break;
1311                     }
1312                     if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1313                               error = netmap_bdg_ctl(nmr, NULL);
1314                               break;
1315                     }
1316 
1317                     NMG_LOCK();
1318                     do {
1319                               /* memsize is always valid */
1320                               struct netmap_mem_d *nmd = &nm_mem;
1321                               u_int memflags;
1322 
1323                               if (nmr->nr_name[0] != '\0') {
1324                                         /* get a refcount */
1325                                         error = netmap_get_na(nmr, &na, 1 /* create */);
1326                                         if (error)
1327                                                   break;
1328                                         nmd = na->nm_mem; /* get memory allocator */
1329                               }
1330 
1331                               error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1332                               if (error)
1333                                         break;
1334                               if (na == NULL) /* only memory info */
1335                                         break;
1336                               nmr->nr_offset = 0;
1337                               nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1338                               netmap_update_config(na);
1339                               nmr->nr_rx_rings = na->num_rx_rings;
1340                               nmr->nr_tx_rings = na->num_tx_rings;
1341                               nmr->nr_rx_slots = na->num_rx_desc;
1342                               nmr->nr_tx_slots = na->num_tx_desc;
1343                               if (memflags & NETMAP_MEM_PRIVATE)
1344                                         nmr->nr_ringid |= NETMAP_PRIV_MEM;
1345                               netmap_adapter_put(na);
1346                     } while (0);
1347                     NMG_UNLOCK();
1348                     break;
1349 
1350           case NIOCREGIF:
1351                     if (nmr->nr_version != NETMAP_API) {
1352                               nmr->nr_version = NETMAP_API;
1353                               error = EINVAL;
1354                               break;
1355                     }
1356                     /* possibly attach/detach NIC and VALE switch */
1357                     i = nmr->nr_cmd;
1358                     if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
1359                               error = netmap_bdg_ctl(nmr, NULL);
1360                               break;
1361                     } else if (i != 0) {
1362                               D("nr_cmd must be 0 not %d", i);
1363                               error = EINVAL;
1364                               break;
1365                     }
1366 
1367                     /* protect access to priv from concurrent NIOCREGIF */
1368                     NMG_LOCK();
1369                     do {
1370                               u_int memflags;
1371 
1372                               if (priv->np_na != NULL) {    /* thread already registered */
1373                                         error = netmap_set_ringid(priv, nmr->nr_ringid);
1374                                         break;
1375                               }
1376                               /* find the interface and a reference */
1377                               error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1378                               if (error)
1379                                         break;
1380                               ifp = na->ifp;
1381                               if (NETMAP_OWNED_BY_KERN(na)) {
1382                                         netmap_adapter_put(na);
1383                                         error = EBUSY;
1384                                         break;
1385                               }
1386                               nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1387                               if (!nifp) {    /* reg. failed, release priv and ref */
1388                                         netmap_adapter_put(na);
1389                                         priv->np_nifp = NULL;
1390                                         break;
1391                               }
1392 
1393                               /* return the offset of the netmap_if object */
1394                               nmr->nr_rx_rings = na->num_rx_rings;
1395                               nmr->nr_tx_rings = na->num_tx_rings;
1396                               nmr->nr_rx_slots = na->num_rx_desc;
1397                               nmr->nr_tx_slots = na->num_tx_desc;
1398                               error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1399                               if (error) {
1400                                         netmap_adapter_put(na);
1401                                         break;
1402                               }
1403                               if (memflags & NETMAP_MEM_PRIVATE) {
1404                                         nmr->nr_ringid |= NETMAP_PRIV_MEM;
1405                                         *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1406                               }
1407                               nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1408                     } while (0);
1409                     NMG_UNLOCK();
1410                     break;
1411 
1412           case NIOCUNREGIF:
1413                     // XXX we have no data here ?
1414                     D("deprecated, data is %p", nmr);
1415                     error = EINVAL;
1416                     break;
1417 
1418           case NIOCTXSYNC:
1419           case NIOCRXSYNC:
1420                     nifp = priv->np_nifp;
1421 
1422                     if (nifp == NULL) {
1423                               error = ENXIO;
1424                               break;
1425                     }
1426                     rmb(); /* make sure following reads are not from cache */
1427 
1428                     na = priv->np_na;      /* we have a reference */
1429 
1430                     if (na == NULL) {
1431                               D("Internal error: nifp != NULL && na == NULL");
1432                               error = ENXIO;
1433                               break;
1434                     }
1435 
1436                     ifp = na->ifp;
1437                     if (ifp == NULL) {
1438                               RD(1, "the ifp is gone");
1439                               error = ENXIO;
1440                               break;
1441                     }
1442 
1443                     if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1444                               if (cmd == NIOCTXSYNC)
1445                                         netmap_txsync_to_host(na);
1446                               else
1447                                         netmap_rxsync_from_host(na, NULL, NULL);
1448                               break;
1449                     }
1450                     /* find the last ring to scan */
1451                     lim = priv->np_qlast;
1452                     if (lim == NETMAP_HW_RING)
1453                               lim = (cmd == NIOCTXSYNC) ?
1454                                   na->num_tx_rings : na->num_rx_rings;
1455 
1456                     krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1457                     for (i = priv->np_qfirst; i < lim; i++) {
1458                               struct netmap_kring *kring = krings + i;
1459                               if (nm_kr_tryget(kring)) {
1460                                         error = EBUSY;
1461                                         goto out;
1462                               }
1463                               if (cmd == NIOCTXSYNC) {
1464                                         if (netmap_verbose & NM_VERB_TXSYNC)
1465                                                   D("pre txsync ring %d cur %d hwcur %d",
1466                                                       i, kring->ring->cur,
1467                                                       kring->nr_hwcur);
1468                                         na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1469                                         if (netmap_verbose & NM_VERB_TXSYNC)
1470                                                   D("post txsync ring %d cur %d hwcur %d",
1471                                                       i, kring->ring->cur,
1472                                                       kring->nr_hwcur);
1473                               } else {
1474                                         na->nm_rxsync(na, i, NAF_FORCE_READ);
1475                                         microtime(&na->rx_rings[i].ring->ts);
1476                               }
1477                               nm_kr_put(kring);
1478                     }
1479 
1480                     break;
1481           case BIOCIMMEDIATE:
1482           case BIOCGHDRCMPLT:
1483           case BIOCSHDRCMPLT:
1484           case BIOCSSEESENT:
1485                     D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1486                     break;
1487 
1488           default:  /* allow device-specific ioctls */
1489               {
1490                     struct socket so;
1491 
1492                     bzero(&so, sizeof(so));
1493                     NMG_LOCK();
1494                     error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1495                     if (error) {
1496                               netmap_adapter_put(na);
1497                               NMG_UNLOCK();
1498                               break;
1499                     }
1500                     ifp = na->ifp;
1501                     // so->so_proto not null.
1502                     error = ifioctl(&so, cmd, ap->a_data, ap->a_cred);
1503                     netmap_adapter_put(na);
1504                     NMG_UNLOCK();
1505                     break;
1506               }
1507           }
1508 out:
1509 
1510           return (error);
1511 }
1512 
/*
 * kqueue event callback for knotes attached to the netmap device.
 *
 * Placeholder implementation: both arguments are ignored and the
 * function unconditionally returns 0.
 */
static int
netmap_kqfilter_event(struct knote *kn, long hint)
{
	(void)kn;
	(void)hint;

	return 0;
}
1518 
/*
 * kqueue detach callback for knotes attached to the netmap device.
 *
 * Intentionally empty: netmap_kqfilter() only stores a filterops
 * pointer in the knote, so there is nothing to undo here.
 */
static void
netmap_kqfilter_detach(struct knote *kn)
{
	(void)kn;
}
1523 
1524 static struct filterops netmap_kqfilter_ops = {
1525           FILTEROP_ISFD, NULL, netmap_kqfilter_detach, netmap_kqfilter_event,
1526 };
1527 
1528 int
netmap_kqfilter(struct dev_kqfilter_args * ap)1529 netmap_kqfilter(struct dev_kqfilter_args *ap)
1530 {
1531           struct knote *kn = ap->a_kn;
1532 
1533           ap->a_result = 0;
1534 
1535           switch (kn->kn_filter) {
1536           case EVFILT_READ:
1537           case EVFILT_WRITE:
1538                     kn->kn_fop = &netmap_kqfilter_ops;
1539                     break;
1540           default:
1541                     ap->a_result = EOPNOTSUPP;
1542                     return (0);
1543           }
1544 
1545           return (0);
1546 }
1547 
1548 /*
1549  * select(2) and poll(2) handlers for the "netmap" device.
1550  *
1551  * Can be called for one or more queues.
1552  * Return true the event mask corresponding to ready events.
1553  * If there are no ready events, do a selrecord on either individual
1554  * selinfo or on the global one.
1555  * Device-dependent parts (locking and sync of tx/rx rings)
1556  * are done through callbacks.
1557  *
1558  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
1559  * The first one is remapped to pwait as selrecord() uses the name as an
1560  * hidden argument.
1561  */
1562 static inline int   /* XXX mute unused for now */
netmap_poll(struct cdev * dev,int events,struct thread * td)1563 netmap_poll(struct cdev *dev, int events, struct thread *td)
1564 {
1565           struct netmap_priv_d *priv = NULL;
1566           struct netmap_adapter *na;
1567           struct ifnet *ifp;
1568           struct netmap_kring *kring;
1569           u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1570           u_int lim_tx, lim_rx, host_forwarded = 0;
1571           struct mbq q;
1572           void *pwait = dev;  /* linux compatibility */
1573 
1574           /*
1575            * In order to avoid nested locks, we need to "double check"
1576            * txsync and rxsync if we decide to do a selrecord().
1577            * retry_tx (and retry_rx, later) prevent looping forever.
1578            */
1579           int retry_tx = 1;
1580 
1581           (void)pwait;
1582           mbq_init(&q);
1583 
1584           /* XXX poll isn't ported yet so fill in NULL as a placeholder: */
1585           if (devfs_get_cdevpriv(NULL, (void **)&priv) != 0 || priv == NULL)
1586                     return POLLERR;
1587 
1588           if (priv->np_nifp == NULL) {
1589                     D("No if registered");
1590                     return POLLERR;
1591           }
1592           rmb(); /* make sure following reads are not from cache */
1593 
1594           na = priv->np_na;
1595           ifp = na->ifp;
1596           // check for deleted
1597           if (ifp == NULL) {
1598                     RD(1, "the ifp is gone");
1599                     return POLLERR;
1600           }
1601 
1602           if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1603                     return POLLERR;
1604 
1605           if (netmap_verbose & 0x8000)
1606                     D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1607           want_tx = events & (POLLOUT | POLLWRNORM);
1608           want_rx = events & (POLLIN | POLLRDNORM);
1609 
1610           lim_tx = na->num_tx_rings;
1611           lim_rx = na->num_rx_rings;
1612 
1613           if (priv->np_qfirst == NETMAP_SW_RING) {
1614                     /* handle the host stack ring */
1615                     if (priv->np_txpoll || want_tx) {
1616                               /* push any packets up, then we are always ready */
1617                               netmap_txsync_to_host(na);
1618                               revents |= want_tx;
1619                     }
1620                     if (want_rx) {
1621                               kring = &na->rx_rings[lim_rx];
1622                               if (kring->ring->avail == 0)
1623                                         netmap_rxsync_from_host(na, td, dev);
1624                               if (kring->ring->avail > 0) {
1625                                         revents |= want_rx;
1626                               }
1627                     }
1628                     return (revents);
1629           }
1630 
1631           /*
1632            * If we are in transparent mode, check also the host rx ring
1633            * XXX Transparent mode at the moment requires to bind all
1634            * rings to a single file descriptor.
1635            */
1636           kring = &na->rx_rings[lim_rx];
1637           if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1638                               && want_rx
1639                               && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
1640                     if (kring->ring->avail == 0)
1641                               netmap_rxsync_from_host(na, td, dev);
1642                     if (kring->ring->avail > 0)
1643                               revents |= want_rx;
1644           }
1645 
1646           /*
1647            * check_all_{tx|rx} are set if the card has more than one queue AND
1648            * the file descriptor is bound to all of them. If so, we sleep on
1649            * the "global" selinfo, otherwise we sleep on individual selinfo
1650            * (FreeBSD only allows two selinfo's per file descriptor).
1651            * The interrupt routine in the driver wake one or the other
1652            * (or both) depending on which clients are active.
1653            *
1654            * rxsync() is only called if we run out of buffers on a POLLIN.
1655            * txsync() is called if we run out of buffers on POLLOUT, or
1656            * there are pending packets to send. The latter can be disabled
1657            * passing NETMAP_NO_TX_POLL in the NIOCREG call.
1658            */
1659           check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1660           check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1661 
1662           if (priv->np_qlast != NETMAP_HW_RING) {
1663                     lim_tx = lim_rx = priv->np_qlast;
1664           }
1665 
1666           /*
1667            * We start with a lock free round which is cheap if we have
1668            * slots available. If this fails, then lock and call the sync
1669            * routines.
1670            */
1671           for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1672                     kring = &na->rx_rings[i];
1673                     if (kring->ring->avail > 0) {
1674                               revents |= want_rx;
1675                               want_rx = 0;        /* also breaks the loop */
1676                     }
1677           }
1678           for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1679                     kring = &na->tx_rings[i];
1680                     if (kring->ring->avail > 0) {
1681                               revents |= want_tx;
1682                               want_tx = 0;        /* also breaks the loop */
1683                     }
1684           }
1685 
1686           /*
1687            * If we to push packets out (priv->np_txpoll) or want_tx is
1688            * still set, we do need to run the txsync calls (on all rings,
1689            * to avoid that the tx rings stall).
1690            * XXX should also check cur != hwcur on the tx rings.
1691            * Fortunately, normal tx mode has np_txpoll set.
1692            */
1693           if (priv->np_txpoll || want_tx) {
1694                     /* If we really want to be woken up (want_tx),
1695                      * do a selrecord, either on the global or on
1696                      * the private structure.  Then issue the txsync
1697                      * so there is no race in the selrecord/selwait
1698                      */
1699 flush_tx:
1700                     for (i = priv->np_qfirst; i < lim_tx; i++) {
1701                               kring = &na->tx_rings[i];
1702                               /*
1703                                * Skip this ring if want_tx == 0
1704                                * (we have already done a successful sync on
1705                                * a previous ring) AND kring->cur == kring->hwcur
1706                                * (there are no pending transmissions for this ring).
1707                                */
1708                               if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1709                                         continue;
1710                               /* make sure only one user thread is doing this */
1711                               if (nm_kr_tryget(kring)) {
1712                                         ND("ring %p busy is %d",
1713                                             kring, (int)kring->nr_busy);
1714                                         revents |= POLLERR;
1715                                         goto out;
1716                               }
1717 
1718                               if (netmap_verbose & NM_VERB_TXSYNC)
1719                                         D("send %d on %s %d",
1720                                                   kring->ring->cur, NM_IFPNAME(ifp), i);
1721                               if (na->nm_txsync(na, i, 0))
1722                                         revents |= POLLERR;
1723 
1724                               /* Check avail/call selrecord only if called with POLLOUT */
1725                               if (want_tx) {
1726                                         if (kring->ring->avail > 0) {
1727                                                   /* stop at the first ring. We don't risk
1728                                                    * starvation.
1729                                                    */
1730                                                   revents |= want_tx;
1731                                                   want_tx = 0;
1732                                         }
1733                               }
1734                               nm_kr_put(kring);
1735                     }
1736                     if (want_tx && retry_tx) {
1737                               KNOTE(check_all_tx ? &na->tx_si.ki_note :
1738                                   &na->tx_rings[priv->np_qfirst].si.ki_note, 0);
1739                               retry_tx = 0;
1740                               goto flush_tx;
1741                     }
1742           }
1743 
1744           /*
1745            * now if want_rx is still set we need to lock and rxsync.
1746            * Do it on all rings because otherwise we starve.
1747            */
1748           if (want_rx) {
1749                     int retry_rx = 1;
1750 do_retry_rx:
1751                     for (i = priv->np_qfirst; i < lim_rx; i++) {
1752                               kring = &na->rx_rings[i];
1753 
1754                               if (nm_kr_tryget(kring)) {
1755                                         revents |= POLLERR;
1756                                         goto out;
1757                               }
1758 
1759                               /* XXX NR_FORWARD should only be read on
1760                                * physical or NIC ports
1761                                */
1762                               if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
1763                                         ND(10, "forwarding some buffers up %d to %d",
1764                                             kring->nr_hwcur, kring->ring->cur);
1765                                         netmap_grab_packets(kring, &q, netmap_fwd);
1766                               }
1767 
1768                               if (na->nm_rxsync(na, i, 0))
1769                                         revents |= POLLERR;
1770                               if (netmap_no_timestamp == 0 ||
1771                                                   kring->ring->flags & NR_TIMESTAMP) {
1772                                         microtime(&kring->ring->ts);
1773                               }
1774 
1775                               if (kring->ring->avail > 0) {
1776                                         revents |= want_rx;
1777                                         retry_rx = 0;
1778                               }
1779                               nm_kr_put(kring);
1780                     }
1781                     if (retry_rx) {
1782                               retry_rx = 0;
1783                               KNOTE(check_all_rx ? &na->rx_si.ki_note :
1784                                   &na->rx_rings[priv->np_qfirst].si.ki_note, 0);
1785                               goto do_retry_rx;
1786                     }
1787           }
1788 
1789           /* forward host to the netmap ring.
1790            * I am accessing nr_hwavail without lock, but netmap_transmit
1791            * can only increment it, so the operation is safe.
1792            */
1793           kring = &na->rx_rings[lim_rx];
1794           if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1795                               && (netmap_fwd || kring->ring->flags & NR_FORWARD)
1796                                && kring->nr_hwavail > 0 && !host_forwarded) {
1797                     netmap_sw_to_nic(na);
1798                     host_forwarded = 1; /* prevent another pass */
1799                     want_rx = 0;
1800                     goto flush_tx;
1801           }
1802 
1803           if (q.head)
1804                     netmap_send_up(na->ifp, &q);
1805 
1806 out:
1807 
1808           return (revents);
1809 }
1810 
1811 /*------- driver support routines ------*/
1812 
/* Defined further down; needed by netmap_attach_common(). */
static int netmap_hw_krings_create(struct netmap_adapter *na);
1814 
1815 static int
netmap_notify(struct netmap_adapter * na,u_int n_ring,enum txrx tx,int flags)1816 netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
1817 {
1818           struct netmap_kring *kring;
1819 
1820           if (tx == NR_TX) {
1821                     kring = na->tx_rings + n_ring;
1822                     KNOTE(&kring->si.ki_note, 0);
1823                     wakeup(&kring->si.ki_note);
1824                     if (flags & NAF_GLOBAL_NOTIFY)
1825                               wakeup(&na->tx_si.ki_note);
1826           } else {
1827                     kring = na->rx_rings + n_ring;
1828                     KNOTE(&kring->si.ki_note, 0);
1829                     wakeup(&kring->si.ki_note);
1830                     if (flags & NAF_GLOBAL_NOTIFY)
1831                               wakeup(&na->rx_si.ki_note);
1832           }
1833           return 0;
1834 }
1835 
1836 
1837 // XXX check handling of failures
1838 int
netmap_attach_common(struct netmap_adapter * na)1839 netmap_attach_common(struct netmap_adapter *na)
1840 {
1841           struct ifnet *ifp = na->ifp;
1842 
1843           if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
1844                     D("%s: invalid rings tx %d rx %d",
1845                               ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
1846                     return EINVAL;
1847           }
1848           WNA(ifp) = na;
1849           NETMAP_SET_CAPABLE(ifp);
1850           if (na->nm_krings_create == NULL) {
1851                     na->nm_krings_create = netmap_hw_krings_create;
1852                     na->nm_krings_delete = netmap_krings_delete;
1853           }
1854           if (na->nm_notify == NULL)
1855                     na->nm_notify = netmap_notify;
1856           na->active_fds = 0;
1857 
1858           if (na->nm_mem == NULL)
1859                     na->nm_mem = &nm_mem;
1860           return 0;
1861 }
1862 
1863 
1864 void
netmap_detach_common(struct netmap_adapter * na)1865 netmap_detach_common(struct netmap_adapter *na)
1866 {
1867           if (na->ifp)
1868                     WNA(na->ifp) = NULL; /* XXX do we need this? */
1869 
1870           if (na->tx_rings) { /* XXX should not happen */
1871                     D("freeing leftover tx_rings");
1872                     na->nm_krings_delete(na);
1873           }
1874           if (na->na_flags & NAF_MEM_OWNER)
1875                     netmap_mem_private_delete(na->nm_mem);
1876           bzero(na, sizeof(*na));
1877           kfree(na, M_DEVBUF);
1878 }
1879 
1880 
1881 /*
1882  * Initialize a ``netmap_adapter`` object created by driver on attach.
1883  * We allocate a block of memory with room for a struct netmap_adapter
1884  * plus two sets of N+2 struct netmap_kring (where N is the number
1885  * of hardware rings):
1886  * krings 0..N-1    are for the hardware queues.
1887  * kring  N         is for the host stack queue
1888  * kring  N+1       is only used for the selinfo for all queues.
1889  * Return 0 on success, ENOMEM otherwise.
1890  *
1891  * By default the receive and transmit adapter ring counts are both initialized
1892  * to num_queues.  na->num_tx_rings can be set for cards with different tx/rx
1893  * setups.
1894  */
1895 int
netmap_attach(struct netmap_adapter * arg)1896 netmap_attach(struct netmap_adapter *arg)
1897 {
1898           struct netmap_hw_adapter *hwna = NULL;
1899           // XXX when is arg == NULL ?
1900           struct ifnet *ifp = arg ? arg->ifp : NULL;
1901 
1902           if (arg == NULL || ifp == NULL)
1903                     goto fail;
1904           hwna = kmalloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
1905           if (hwna == NULL)
1906                     goto fail;
1907           hwna->up = *arg;
1908           if (netmap_attach_common(&hwna->up)) {
1909                     kfree(hwna, M_DEVBUF);
1910                     goto fail;
1911           }
1912           netmap_adapter_get(&hwna->up);
1913 
1914           D("success for %s", NM_IFPNAME(ifp));
1915           return 0;
1916 
1917 fail:
1918           D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
1919           netmap_detach(ifp);
1920           return (hwna ? EINVAL : ENOMEM);
1921 }
1922 
1923 
1924 void
NM_DBG(netmap_adapter_get)1925 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
1926 {
1927           if (!na) {
1928                     return;
1929           }
1930 
1931           refcount_acquire(&na->na_refcount);
1932 }
1933 
1934 
1935 /* returns 1 iff the netmap_adapter is destroyed */
1936 int
NM_DBG(netmap_adapter_put)1937 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
1938 {
1939           if (!na)
1940                     return 1;
1941 
1942           if (!refcount_release(&na->na_refcount))
1943                     return 0;
1944 
1945           if (na->nm_dtor)
1946                     na->nm_dtor(na);
1947 
1948           netmap_detach_common(na);
1949 
1950           return 1;
1951 }
1952 
1953 
1954 int
netmap_hw_krings_create(struct netmap_adapter * na)1955 netmap_hw_krings_create(struct netmap_adapter *na)
1956 {
1957           return netmap_krings_create(na,
1958                     na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
1959 }
1960 
1961 
1962 
1963 /*
1964  * Free the allocated memory linked to the given ``netmap_adapter``
1965  * object.
1966  */
1967 void
netmap_detach(struct ifnet * ifp)1968 netmap_detach(struct ifnet *ifp)
1969 {
1970           struct netmap_adapter *na = NA(ifp);
1971 
1972           if (!na)
1973                     return;
1974 
1975           NMG_LOCK();
1976           netmap_disable_all_rings(ifp);
1977           netmap_adapter_put(na);
1978           na->ifp = NULL;
1979           netmap_enable_all_rings(ifp);
1980           NMG_UNLOCK();
1981 }
1982 
1983 
1984 /*
1985  * Intercept packets from the network stack and pass them
1986  * to netmap as incoming packets on the 'software' ring.
1987  * We rely on the OS to make sure that the ifp and na do not go
1988  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
1989  * In nm_register() or whenever there is a reinitialization,
1990  * we make sure to access the core lock and per-ring locks
1991  * so that IFCAP_NETMAP is visible here.
1992  */
1993 int
netmap_transmit(struct ifnet * ifp,struct mbuf * m)1994 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
1995 {
1996           struct netmap_adapter *na = NA(ifp);
1997           struct netmap_kring *kring;
1998           u_int i, len = MBUF_LEN(m);
1999           u_int error = EBUSY, lim;
2000           struct netmap_slot *slot;
2001 
2002           // XXX [Linux] we do not need this lock
2003           // if we follow the down/configure/up protocol -gl
2004           // mtx_lock(&na->core_lock);
2005           if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2006                     /* interface not in netmap mode anymore */
2007                     error = ENXIO;
2008                     goto done;
2009           }
2010 
2011           kring = &na->rx_rings[na->num_rx_rings];
2012           lim = kring->nkr_num_slots - 1;
2013           if (netmap_verbose & NM_VERB_HOST)
2014                     D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
2015                               kring->nr_hwcur + kring->nr_hwavail, len);
2016           // XXX reconsider long packets if we handle fragments
2017           if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2018                     D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2019                               len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2020                     goto done;
2021           }
2022           /* protect against other instances of netmap_transmit,
2023            * and userspace invocations of rxsync().
2024            */
2025           // XXX [Linux] there can be no other instances of netmap_transmit
2026           // on this same ring, but we still need this lock to protect
2027           // concurrent access from netmap_sw_to_nic() -gl
2028           lockmgr(&kring->q_lock, LK_EXCLUSIVE);
2029           if (kring->nr_hwavail >= lim) {
2030                     if (netmap_verbose)
2031                               D("stack ring %s full\n", NM_IFPNAME(ifp));
2032           } else {
2033                     /* compute the insert position */
2034                     i = nm_kr_rxpos(kring);
2035                     slot = &kring->ring->slot[i];
2036                     m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
2037                     slot->len = len;
2038                     slot->flags = kring->nkr_slot_flags;
2039                     kring->nr_hwavail++;
2040                     if (netmap_verbose  & NM_VERB_HOST)
2041                               D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
2042                     na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2043                     error = 0;
2044           }
2045           lockmgr(&kring->q_lock, LK_RELEASE);
2046 
2047 done:
2048           // mtx_unlock(&na->core_lock);
2049 
2050           /* release the mbuf in either cases of success or failure. As an
2051            * alternative, put the mbuf in a free list and free the list
2052            * only when really necessary.
2053            */
2054           m_freem(m);
2055 
2056           return (error);
2057 }
2058 
2059 
2060 /*
2061  * netmap_reset() is called by the driver routines when reinitializing
2062  * a ring. The driver is in charge of locking to protect the kring.
2063  * If native netmap mode is not set just return NULL.
2064  */
2065 struct netmap_slot *
netmap_reset(struct netmap_adapter * na,enum txrx tx,u_int n,u_int new_cur)2066 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2067           u_int new_cur)
2068 {
2069           struct netmap_kring *kring;
2070           int new_hwofs, lim;
2071 
2072           if (na == NULL) {
2073                     D("NULL na, should not happen");
2074                     return NULL;        /* no netmap support here */
2075           }
2076           if (!(na->ifp->if_capenable & IFCAP_NETMAP) || nma_is_generic(na)) {
2077                     ND("interface not in netmap mode");
2078                     return NULL;        /* nothing to reinitialize */
2079           }
2080 
2081           /* XXX note- in the new scheme, we are not guaranteed to be
2082            * under lock (e.g. when called on a device reset).
2083            * In this case, we should set a flag and do not trust too
2084            * much the values. In practice: TODO
2085            * - set a RESET flag somewhere in the kring
2086            * - do the processing in a conservative way
2087            * - let the *sync() fixup at the end.
2088            */
2089           if (tx == NR_TX) {
2090                     if (n >= na->num_tx_rings)
2091                               return NULL;
2092                     kring = na->tx_rings + n;
2093                     new_hwofs = kring->nr_hwcur - new_cur;
2094           } else {
2095                     if (n >= na->num_rx_rings)
2096                               return NULL;
2097                     kring = na->rx_rings + n;
2098                     new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
2099           }
2100           lim = kring->nkr_num_slots - 1;
2101           if (new_hwofs > lim)
2102                     new_hwofs -= lim + 1;
2103 
2104           /* Always set the new offset value and realign the ring. */
2105           D("%s hwofs %d -> %d, hwavail %d -> %d",
2106                     tx == NR_TX ? "TX" : "RX",
2107                     kring->nkr_hwofs, new_hwofs,
2108                     kring->nr_hwavail,
2109                     tx == NR_TX ? lim : kring->nr_hwavail);
2110           kring->nkr_hwofs = new_hwofs;
2111           if (tx == NR_TX)
2112                     kring->nr_hwavail = lim;
2113           kring->nr_hwreserved = 0;
2114 
2115           /*
2116            * Wakeup on the individual and global selwait
2117            * We do the wakeup here, but the ring is not yet reconfigured.
2118            * However, we are under lock so there are no races.
2119            */
2120           na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2121           return kring->ring->slot;
2122 }
2123 
2124 
2125 /*
2126  * Default functions to handle rx/tx interrupts from a physical device.
2127  * "work_done" is non-null on the RX path, NULL for the TX path.
2128  * "generic" is 0 when we are called by a device driver, and 1 when we
2129  * are called by the generic netmap adapter layer.
2130  * We rely on the OS to make sure that there is only one active
2131  * instance per queue, and that there is appropriate locking.
2132  *
2133  * If the card is not in netmap mode, simply return 0,
2134  * so that the caller proceeds with regular processing.
2135  *
2136  * We return 0 also when the card is in netmap mode but the current
2137  * netmap adapter is the generic one, because this function will be
2138  * called by the generic layer.
2139  *
2140  * If the card is connected to a netmap file descriptor,
2141  * do a selwakeup on the individual queue, plus one on the global one
2142  * if needed (multiqueue card _and_ there are multiqueue listeners),
2143  * and return 1.
2144  *
2145  * Finally, if called on rx from an interface connected to a switch,
2146  * calls the proper forwarding routine, and return 1.
2147  */
2148 int
netmap_common_irq(struct ifnet * ifp,u_int q,u_int * work_done)2149 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2150 {
2151           struct netmap_adapter *na = NA(ifp);
2152           struct netmap_kring *kring;
2153 
2154           q &= NETMAP_RING_MASK;
2155 
2156           if (netmap_verbose) {
2157                   RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
2158           }
2159 
2160           if (work_done) { /* RX path */
2161                     if (q >= na->num_rx_rings)
2162                               return 0; // not a physical queue
2163                     kring = na->rx_rings + q;
2164                     kring->nr_kflags |= NKR_PENDINTR;       // XXX atomic ?
2165                     na->nm_notify(na, q, NR_RX,
2166                               (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2167                     *work_done = 1; /* do not fire napi again */
2168           } else { /* TX path */
2169                     if (q >= na->num_tx_rings)
2170                               return 0; // not a physical queue
2171                     kring = na->tx_rings + q;
2172                     na->nm_notify(na, q, NR_TX,
2173                               (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2174           }
2175           return 1;
2176 }
2177 
2178 /*
2179  * Default functions to handle rx/tx interrupts from a physical device.
2180  * "work_done" is non-null on the RX path, NULL for the TX path.
2181  * "generic" is 0 when we are called by a device driver, and 1 when we
2182  * are called by the generic netmap adapter layer.
2183  * We rely on the OS to make sure that there is only one active
2184  * instance per queue, and that there is appropriate locking.
2185  *
2186  * If the card is not in netmap mode, simply return 0,
2187  * so that the caller proceeds with regular processing.
2188  *
2189  * If the card is connected to a netmap file descriptor,
2190  * do a selwakeup on the individual queue, plus one on the global one
2191  * if needed (multiqueue card _and_ there are multiqueue listeners),
2192  * and return 1.
2193  *
2194  * Finally, if called on rx from an interface connected to a switch,
2195  * calls the proper forwarding routine, and return 1.
2196  */
2197 int
netmap_rx_irq(struct ifnet * ifp,u_int q,u_int * work_done)2198 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2199 {
2200           // XXX could we check NAF_NATIVE_ON ?
2201           if (!(ifp->if_capenable & IFCAP_NETMAP))
2202                     return 0;
2203 
2204           if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2205                     ND("use regular interrupt");
2206                     return 0;
2207           }
2208 
2209           return netmap_common_irq(ifp, q, work_done);
2210 }
2211 
2212 
/* The /dev/netmap character device: set by netmap_init(), destroyed by netmap_fini(). */
static struct cdev *netmap_dev;
2214 
2215 
2216 /*
2217  * Module loader.
2218  *
2219  * Create the /dev/netmap device and initialize all global
2220  * variables.
2221  *
2222  * Return 0 on success, errno on failure.
2223  */
2224 int
netmap_init(void)2225 netmap_init(void)
2226 {
2227           int error;
2228 
2229           NMG_LOCK_INIT();
2230 
2231           error = netmap_mem_init();
2232           if (error != 0) {
2233                     kprintf("netmap: unable to initialize the memory allocator.\n");
2234                     return (error);
2235           }
2236           kprintf("netmap: loaded module\n");
2237           netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2238                                     "netmap");
2239 
2240           netmap_init_bridges();
2241           return (error);
2242 }
2243 
2244 
2245 /*
2246  * Module unloader.
2247  *
2248  * Free all the memory, and destroy the ``/dev/netmap`` device.
2249  */
2250 void
netmap_fini(void)2251 netmap_fini(void)
2252 {
2253           destroy_dev(netmap_dev);
2254           netmap_mem_fini();
2255           NMG_LOCK_DESTROY();
2256           kprintf("netmap: unloaded module.\n");
2257 }
2258