xref: /dragonfly/sys/net/netmap/netmap.h (revision f933b737dabc806a2f1680f0afea2fb42a345b92)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28  * $FreeBSD: head/sys/net/netmap.h 251139 2013-05-30 14:07:14Z luigi $
29  *
30  * Definitions of constants and the structures used by the netmap
31  * framework, for the part visible to both kernel and userspace.
32  * Detailed info on netmap is available with "man netmap" or at
33  *
34  *        http://info.iet.unipi.it/~luigi/netmap/
35  *
36  * This API is also used to communicate with the VALE software switch
37  */
38 
39 #ifndef _NET_NETMAP_H_
40 #define _NET_NETMAP_H_
41 
42 /*
43  * --- Netmap data structures ---
44  *
45  * The userspace data structures used by netmap are shown below.
46  * They are allocated by the kernel and mmap()ed by userspace threads.
47  * Pointers are implemented as memory offsets or indexes,
48  * so that they can be easily dereferenced in kernel and userspace.
49 
50    KERNEL (opaque, obviously)
51 
52   ====================================================================
53                                          |
54    USERSPACE                             |      struct netmap_ring
55                                          +---->+--------------+
56                                              / | cur          |
57    struct netmap_if (nifp, 1 per fd)        /  | avail        |
58     +---------------+                      /   | buf_ofs      |
59     | ni_tx_rings   |                     /    +==============+
60     | ni_rx_rings   |                    /     | buf_idx, len | slot[0]
61     |               |                   /      | flags, ptr   |
62     |               |                  /       +--------------+
63     +===============+                 /        | buf_idx, len | slot[1]
64     | txring_ofs[0] | (rel.to nifp)--'         | flags, ptr   |
65     | txring_ofs[1] |                          +--------------+
66   (ni_tx_rings+1 entries)                     (num_slots entries)
67     | txring_ofs[t] |                          | buf_idx, len | slot[n-1]
68     +---------------+                          | flags, ptr   |
69     | rxring_ofs[0] |                          +--------------+
70     | rxring_ofs[1] |
71   (ni_rx_rings+1 entries)
72     | rxring_ofs[r] |
73     +---------------+
74 
75  * For each "interface" (NIC, host stack, VALE switch port) attached to a
76  * file descriptor, the mmap()ed region contains a (logically readonly)
77  * struct netmap_if pointing to struct netmap_ring's.
78  * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
79  * pair attached to the host stack (this pair is unused for VALE ports).
80  *
81  * All physical/host stack ports share the same memory region,
82  * so that zero-copy can be implemented between them.
83  * VALE switch ports instead have separate memory regions.
84  *
85  * The netmap_ring is the userspace-visible replica of the NIC ring.
86  * Each slot has the index of a buffer (MTU-sized and residing in the
87  * mmapped region), its length and some flags. An extra 64-bit pointer
88  * is provided for user-supplied buffers in the tx path.
89  *
90  * In user space, the buffer address is computed as
91  *        (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
92  */
93 
94 /*
95  * struct netmap_slot is a buffer descriptor
96  *
97  * buf_idx          the index of the buffer associated to the slot.
98  * len              the length of the payload
99  * flags            controls operations on the slot, as defined below
100  *
101  * NS_BUF_CHANGED   must be set whenever userspace wants
102  *                  to change buf_idx (it might be necessary to
103  *                  reprogram the NIC)
104  *
105  * NS_REPORT        must be set if we want the NIC to generate an interrupt
106  *                  when this slot is used. Leaving it to 0 improves
107  *                  performance.
108  *
109  * NS_FORWARD       if set on a receive ring, and the device is in
110  *                  transparent mode, buffers released with the flag set
111  *                  will be forwarded to the 'other' side (host stack
112  *                  or NIC, respectively) on the next select() or ioctl()
113  *
114  * NS_NO_LEARN      on a VALE switch, do not 'learn' the source port for
115  *                  this packet.
116  *
117  * NS_INDIRECT      (tx rings only) data is in a userspace buffer pointed
118  *                  by the ptr field in the slot.
119  *
120  * NS_MOREFRAG      Part of a multi-segment frame. The last (or only)
121  *                  segment must not have this flag.
122  *                  Only supported on VALE ports.
123  *
124  * NS_PORT_MASK     the high 8 bits of the flag, if not zero, indicate the
125  *                  destination port for the VALE switch, overriding
126  *                  the lookup table.
127  */
128 
/*
 * Buffer descriptor for one slot of a netmap ring.
 *
 * The low bits of 'flags' carry the NS_* control bits below; the high
 * 8 bits (NS_PORT_SHIFT / NS_PORT_MASK) carry the VALE destination port,
 * or the number of fragments on rx rings (see NS_RFRAGS).
 */
struct netmap_slot {
	uint32_t	buf_idx;	/* buffer index */
	uint16_t	len;		/* packet length */
	uint16_t	flags;		/* buf changed, etc. */
#define	NS_BUF_CHANGED	0x0001	/* buf_idx changed */
#define	NS_REPORT	0x0002	/* ask the hardware to report results
				 * e.g. by generating an interrupt
				 */
#define	NS_FORWARD	0x0004	/* pass packet to the other endpoint
				 * (host stack or device)
				 */
#define	NS_NO_LEARN	0x0008	/* VALE: do not learn the source port */
#define	NS_INDIRECT	0x0010	/* tx rings: data is in the buffer at 'ptr' */
#define	NS_MOREFRAG	0x0020	/* more fragments of this frame follow */
#define	NS_PORT_SHIFT	8
#define	NS_PORT_MASK	(0xff << NS_PORT_SHIFT)
				/*
				 * in rx rings, the high 8 bits
				 * are the number of fragments.
				 */
	/* Same bit field as NS_PORT_MASK, so reuse NS_PORT_SHIFT. */
#define	NS_RFRAGS(_slot)	(((_slot)->flags >> NS_PORT_SHIFT) & 0xff)
	uint64_t	ptr;		/* pointer for indirect buffers */
};
152 
153 /*
154  * struct netmap_ring
155  *
156  * Netmap representation of a TX or RX ring (also known as "queue").
157  * This is a queue implemented as a fixed-size circular array.
158  * At the software level, two fields are important: avail and cur.
159  *
160  * In TX rings:
161  *
162  *        avail     tells how many slots are available for transmission.
163  *                  It is updated by the kernel in each netmap system call.
164  *                  It MUST BE decremented by the user when it
165  *                  adds a new packet to send.
166  *
167  *        cur       indicates the slot to use for the next packet
168  *                  to send (i.e. the "tail" of the queue).
169  *                  It MUST BE incremented by the user before
170  *                  netmap system calls to reflect the number of newly
171  *                  sent packets.
172  *                  It is checked by the kernel on netmap system calls
173  *                  (normally unmodified by the kernel unless invalid).
174  *
175  * In RX rings:
176  *
177  *        avail     is the number of packets available (possibly 0).
178  *                  It is updated by the kernel in each netmap system call.
179  *                  It MUST BE decremented by the user when it
180  *                  consumes a packet.
181  *
182  *        cur       indicates the first slot that contains a packet not
183  *                  yet processed (the "head" of the queue).
184  *                  It MUST BE incremented by the user when it consumes
185  *                  a packet.
186  *
187  *        reserved  indicates the number of buffers before 'cur'
188  *                  that the user has not released yet. Normally 0,
189  *                  it MUST BE incremented by the user when it
190  *                  does not return the buffer immediately, and decremented
191  *                  when the buffer is finally freed.
192  *
193  *
194  * DATA OWNERSHIP/LOCKING:
195  *        The netmap_ring, all slots, and buffers in the range
196  *        [cur-reserved, cur+avail[ are owned by the user program,
197  *        and the kernel only touches them in the same thread context
198  *        during a system call.
199  *        Other buffers are reserved for use by the NIC's DMA engines.
200  *
201  * FLAGS
202  *        NR_TIMESTAMP        updates the 'ts' field on each syscall. This is
203  *                            a global timestamp for all packets.
204  *        NR_RX_TSTMP         if set, the last 64 bytes in each buffer will
205  *                            contain a timestamp for the frame supplied by
206  *                            the hardware (if supported)
207  *        NR_FORWARD          if set, the NS_FORWARD flag in each slot of the
208  *                            RX ring is checked, and if set the packet is
209  *                            passed to the other side (host stack or device,
210  *                            respectively). This permits bpf-like behaviour
211  *                            or transparency for selected packets.
212  */
213 struct netmap_ring {
214           /*
215            * buf_ofs is meant to be used through macros.
216            * It contains the offset of the buffer region from this
217            * descriptor.
218            */
219           const ssize_t       buf_ofs;
220           const uint32_t      num_slots;          /* number of slots in the ring. */
221           uint32_t  avail;              /* number of usable slots */
222           uint32_t        cur;                    /* 'current' r/w position */
223           uint32_t  reserved; /* not refilled before current */
224 
225           const uint16_t      nr_buf_size;
226           uint16_t  flags;
227 #define   NR_TIMESTAMP        0x0002              /* set timestamp on *sync() */
228 #define   NR_FORWARD          0x0004              /* enable NS_FORWARD for ring */
229 #define   NR_RX_TSTMP         0x0008              /* set rx timestamp in slots */
230 
231           struct timeval      ts;                 /* time of last *sync() */
232 
233           /* the slots follow. This struct has variable size */
234           struct netmap_slot slot[0];   /* array of slots. */
235 };
236 
237 
238 /*
239  * Netmap representation of an interface and its queue(s).
240  * This is initialized by the kernel when binding a file
241  * descriptor to a port, and should be considered as readonly
242  * by user programs. The kernel never uses it.
243  *
244  * There is one netmap_if for each file descriptor on which we want
245  * to select/poll.
246  * select/poll operates on one or all pairs depending on the value of
247  * nmr_queueid passed on the ioctl.
248  */
/*
 * Per-file-descriptor description of an interface and its rings.
 * A fixed header followed by the table of ring offsets.
 */
struct netmap_if {
	char		ni_name[IFNAMSIZ];	/* name of the interface. */
	const uint32_t	ni_version;	/* API version, currently unused */
	const uint32_t	ni_flags;	/* properties */
#define	NI_PRIV_MEM	0x1		/* private memory region */

	const uint32_t	ni_rx_rings;	/* number of rx rings */
	const uint32_t	ni_tx_rings;	/* number of tx rings */
	/*
	 * The following array contains the offset of each netmap ring
	 * from this structure. The first ni_tx_rings+1 entries refer
	 * to the tx rings, the next ni_rx_rings+1 refer to the rx rings
	 * (the last entry in each block refers to the host stack rings).
	 * The area is filled up by the kernel on NIOCREGIF,
	 * and then only read by userspace code.
	 *
	 * Declared as a C99 flexible array member instead of the GNU
	 * zero-length array extension; the layout is unchanged.
	 */
	const ssize_t	ring_ofs[];
};
267 
268 #ifndef NIOCREGIF
269 /*
270  * ioctl names and related fields
271  *
272  * NIOCGINFO takes a struct ifreq, the interface name is the input,
273  *        the outputs are number of queues and number of descriptor
274  *        for each queue (useful to set number of threads etc.).
275  *        The info returned is only advisory and may change before
276  *        the interface is bound to a file descriptor.
277  *
278  * NIOCREGIF takes an interface name within a struct ifreq,
279  *        and activates netmap mode on the interface (if possible).
280  *
281  *   nr_name        is the name of the interface
282  *
283  *   nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings
284  *        indicate the configuration of the port on return.
285  *
286  *        On input, non-zero values for nr_tx_rings, nr_tx_slots and the
287  *        rx counterparts may be used to reconfigure the port according
288  *        to the requested values, but this is not guaranteed.
289  *        The actual values are returned on completion of the ioctl().
290  *
291  *   nr_ringid
292  *        indicates how rings should be bound to the file descriptors.
293  *        The default (0) means all physical rings of a NIC are bound.
294  *        NETMAP_HW_RING plus a ring number lets you bind just
295  *        a single ring pair.
296  *        NETMAP_SW_RING binds only the host tx/rx rings
297  *        NETMAP_NO_TX_POLL prevents select()/poll() from pushing
298  *        out packets on the tx ring unless POLLOUT is specified.
299  *
300  *        NETMAP_PRIV_MEM is a return value used to indicate that
301  *        this ring is in a private memory region hence buffer
302  *        swapping cannot be used
303  *
304  *   nr_cmd         is used to configure NICs attached to a VALE switch,
305  *        or to dump the configuration of a VALE switch.
306  *
307  *        nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname
308  *        attaches the NIC to the switch, with nr_ringid specifying
309  *        which rings to use
310  *
311  *        nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname
312  *        disconnects a previously attached NIC
313  *
314  *        nr_cmd = NETMAP_BDG_LIST is used to list the configuration
315  *        of VALE switches, with additional arguments.
316  *
317  * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
318  *        whose identity is set in NIOCREGIF through nr_ringid
319  *
320  * NETMAP_API is the API version.
321  */
322 
323 /*
324  * struct nmreq overlays a struct ifreq
325  */
/*
 * Argument of the NIOCGINFO and NIOCREGIF ioctls.
 * Starts with the interface name so that it overlays a struct ifreq.
 */
struct nmreq {
	char		nr_name[IFNAMSIZ];	/* name of the interface */
	uint32_t	nr_version;	/* API version */
#define	NETMAP_API	5		/* current version */
	uint32_t	nr_offset;	/* nifp offset in the shared region */
	uint32_t	nr_memsize;	/* size of the shared region */
	uint32_t	nr_tx_slots;	/* slots in tx rings */
	uint32_t	nr_rx_slots;	/* slots in rx rings */
	uint16_t	nr_tx_rings;	/* number of tx rings */
	uint16_t	nr_rx_rings;	/* number of rx rings */
	uint16_t	nr_ringid;	/* ring(s) we care about */
#define	NETMAP_PRIV_MEM		0x8000	/* rings use private memory */
#define	NETMAP_HW_RING		0x4000	/* low bits indicate one hw ring */
#define	NETMAP_SW_RING		0x2000	/* process the sw ring */
#define	NETMAP_NO_TX_POLL	0x1000	/* no automatic txsync on poll */
#define	NETMAP_RING_MASK	0xfff	/* the ring number */
	uint16_t	nr_cmd;		/* VALE switch command, see below */
#define	NETMAP_BDG_ATTACH	1	/* attach the NIC */
#define	NETMAP_BDG_DETACH	2	/* detach the NIC */
#define	NETMAP_BDG_LOOKUP_REG	3	/* register lookup function */
#define	NETMAP_BDG_LIST		4	/* get bridge's info */
	/* NOTE(review): nr_arg1/nr_arg2 look like per-command arguments -- confirm */
	uint16_t	nr_arg1;
#define	NETMAP_BDG_HOST		1	/* attach the host stack on ATTACH */
	uint16_t	nr_arg2;
	uint32_t	spare2[3];	/* spares, for compatibility with other versions */
};
352 
353 /*
354  * FreeBSD uses the size value embedded in the _IOWR to determine
355  * how much to copy in/out. So we need it to match the actual
356  * data structure we pass. We put some spares in the structure
357  * to ease compatibility with other versions
358  */
/* netmap ioctl request codes; the _IOWR ones carry a struct nmreq. */
#define	NIOCGINFO	_IOWR('i', 145, struct nmreq)	/* return IF info */
#define	NIOCREGIF	_IOWR('i', 146, struct nmreq)	/* interface register */
#define	NIOCUNREGIF	_IO('i', 147)	/* deprecated. Was interface unregister */
#define	NIOCTXSYNC	_IO('i', 148)	/* sync tx queues */
#define	NIOCRXSYNC	_IO('i', 149)	/* sync rx queues */
364 #endif /* !NIOCREGIF */
365 
366 #endif /* _NET_NETMAP_H_ */
367