xref: /dragonfly/sys/net/dummynet3/ip_dummynet3.h (revision 6a03354eaf5595cb09622704ea7d2ef2794ccffb)
1 /*
2  * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
3  * Portions Copyright (c) 2000 Akamba Corp.
4  * All rights reserved
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD: src/sys/netinet/ip_dummynet.h,v 1.10.2.9 2003/05/13 09:31:06 maxim Exp $
28  * $DragonFly: src/sys/net/dummynet/ip_dummynet.h,v 1.19 2008/09/20 04:36:51 sephe Exp $
29  */
30 
31 #ifndef _IP_DUMMYNET3_H_
32 #define _IP_DUMMYNET3_H_
33 
34 #ifndef _IP_DUMMYNET_H
35 
36 #define MODULE_DUMMYNET_ID    2
37 #define MODULE_DUMMYNET_NAME  "dummynet"
38 
39 
40 #ifdef _KERNEL
41 //placeholder for kernel
42 #endif
43 
44 enum ipfw_dummynet_opcodes {
45           O_DUMMYNET_PIPE,
46           O_DUMMYNET_QUEUE,
47 };
48 
49 /*
50  * We start with a heap, which is used in the scheduler to decide when to
51  * transmit packets etc.
52  *
53  * The key for the heap is used for two different values:
54  *
55  * 1. Timer ticks- max 10K/second, so 32 bits are enough;
56  *
57  * 2. Virtual times.  These increase in steps of len/x, where len is the
58  *        packet length, and x is either the weight of the flow, or the sum
59  *        of all weights.
60  *        If we limit to max 1000 flows and a max weight of 100, then x needs
61  *        17 bits.  The packet size is 16 bits, so we can easily overflow if
62  *        we do not allow errors.
63  *
64  * So we use a key "dn_key" which is 64 bits.
65  *
66  * MY_M is used as a shift count when doing fixed point arithmetic
67  * (a better name would be useful...).
68  */
69 typedef uint64_t    dn_key;   /* sorting key */
70 
71 /*
72  * Number of left shift to obtain a larger precision
73  *
74  * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
75  * virtual time wraps every 15 days.
76  */
77 #define MY_M                  16
78 
79 #ifdef _KERNEL
80 
81 /*
82  * A heap entry is made of a key and a pointer to the actual object stored
83  * in the heap.
84  *
85  * The heap is an array of dn_heap_entry entries, dynamically allocated.
86  * Current size is "size", with "elements" actually in use.
87  *
88  * The heap normally supports only ordered insert and extract from the top.
89  * If we want to extract an object from the middle of the heap, we have to
90  * know where the object itself is located in the heap (or we need to scan
91  * the whole array).  To this purpose, an object has a field (int) which
92  * contains the index of the object itself into the heap.  When the object
93  * is moved, the field must also be updated.  The offset of the index in the
94  * object is stored in the 'offset' field in the heap descriptor.  The
95  * assumption is that this offset is non-zero if we want to support extract
96  * from the middle.
97  */
98 struct dn_heap_entry {
99           dn_key key;         /* sorting key.  Topmost element is smallest one */
100           void *object;       /* object pointer */
101 };
102 
103 struct dn_heap {
104           int size;
105           int elements;
106           int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
107           struct dn_heap_entry *p;      /* really an array of "size" entries */
108 };
109 
110 struct dn_flow_id {
111           uint16_t fid_type;  /* ETHERTYPE_ */
112           uint16_t pad;
113           union {
114                     struct {
115                               uint32_t dst_ip;
116                               uint32_t src_ip;
117                               uint16_t dst_port;
118                               uint16_t src_port;
119                               uint8_t proto;
120                               uint8_t flags;
121                     } inet;
122           } fid_u;
123 #define fid_dst_ip  fid_u.inet.dst_ip
124 #define fid_src_ip  fid_u.inet.src_ip
125 #define fid_dst_port          fid_u.inet.dst_port
126 #define fid_src_port          fid_u.inet.src_port
127 #define fid_proto   fid_u.inet.proto
128 #define fid_flags   fid_u.inet.flags
129 };
130 
131 typedef void        (*ip_dn_unref_priv_t)(void *);
132 struct lwkt_port;
133 
134 /*
135  * struct dn_pkt identifies a packet in the dummynet queue, but is also used
136  * to tag packets passed back to the various destinations (ip_input(),
137  * ip_output() and so on).
138  *
139  * It is a tag (PACKET_TAG_DUMMYNET) associated with the actual mbuf.
140  */
141 struct dn_pkt {
142           struct mbuf *dn_m;
143           TAILQ_ENTRY(dn_pkt) dn_next;
144 
145           void *dn_priv;
146           ip_dn_unref_priv_t dn_unref_priv;
147 
148           uint32_t dn_flags;            /* action when packet comes out. */
149 #define DN_FLAGS_IS_PIPE      0x10
150 #define DN_FLAGS_DIR_MASK     0x0f
151 #define DN_TO_IP_OUT                    1
152 #define DN_TO_IP_IN           2
153 #define DN_TO_ETH_DEMUX                 4
154 #define DN_TO_ETH_OUT                   5
155 #define DN_TO_MAX             6
156 
157           dn_key output_time;           /* when the pkt is due for delivery */
158           struct ifnet *ifp;            /* interface, for ip_output */
159           struct sockaddr_in *dn_dst;
160           struct route ro;              /* route, for ip_output. MUST COPY */
161           int flags;                              /* flags, for ip_output (IPv6 ?) */
162 
163           u_short pipe_nr;              /* pipe/flow_set number */
164           u_short pad;
165 
166           struct dn_flow_id id;         /* flow id */
167           int cpuid;                              /* target cpuid, for assertion */
168           struct lwkt_port *msgport;    /* target msgport */
169 };
170 TAILQ_HEAD(dn_pkt_queue, dn_pkt);
171 
172 /*
173  * Overall structure of dummynet (with WF2Q+):
174  *
175  * In dummynet, packets are selected with the firewall rules, and passed to
176  * two different objects: PIPE or QUEUE.
177  *
178  * A QUEUE is just a queue with configurable size and queue management policy.
179  * It is also associated with a mask (to discriminate among different flows),
180  * a weight (used to give different shares of the bandwidth to different flows)
181  * and a "pipe", which essentially supplies the transmit clock for all queues
182  * associated with that pipe.
183  *
184  * A PIPE emulates a fixed-bandwidth link, whose bandwidth is configurable.
185  * The "clock" for a pipe comes from an internal timer.  A pipe is also
186  * associated with one (or more, if masks are used) queue, where all packets
187  * for that pipe are stored.
188  *
189  * The bandwidth available on the pipe is shared by the queues associated with
190  * that pipe (only one in case the packet is sent to a PIPE) according to the
191  * WF2Q+ scheduling algorithm and the configured weights.
192  *
193  * In general, incoming packets are stored in the appropriate queue, which is
194  * then placed into one of a few heaps managed by a scheduler to decide when
195  * the packet should be extracted.  The scheduler (a function called dummynet())
196  * is run at every timer tick, and grabs queues from the head of the heaps when
197  * they are ready for processing.
198  *
199  * There are three data structures definining a pipe and associated queues:
200  *
201  *  + dn_pipe, which contains the main configuration parameters related to
202  *        delay and bandwidth;
203  *  + dn_flow_set, which contains WF2Q+ configuration, flow masks, plr and
204  *        RED configuration;
205  *  + dn_flow_queue, which is the per-flow queue (containing the packets)
206  *
207  * Multiple dn_flow_set can be linked to the same pipe, and multiple
208  * dn_flow_queue can be linked to the same dn_flow_set.
209  * All data structures are linked in a linear list which is used for
210  * housekeeping purposes.
211  *
212  * During configuration, we create and initialize the dn_flow_set and dn_pipe
213  * structures (a dn_pipe also contains a dn_flow_set).
214  *
215  * At runtime: packets are sent to the appropriate dn_flow_set (either WFQ
216  * ones, or the one embedded in the dn_pipe for fixed-rate flows), which in
217  * turn dispatches them to the appropriate dn_flow_queue (created dynamically
218  * according to the masks).
219  *
220  * The transmit clock for fixed rate flows (ready_event()) selects the
221  * dn_flow_queue to be used to transmit the next packet. For WF2Q,
222  * wfq_ready_event() extract a pipe which in turn selects the right flow using
223  * a number of heaps defined into the pipe itself.
224  */
225 
226 /*
227  * Per flow queue.  This contains the flow identifier, the queue of packets,
228  * counters, and parameters used to support both RED and WF2Q+.
229  *
230  * A dn_flow_queue is created and initialized whenever a packet for a new
231  * flow arrives.
232  */
233 struct dn_flow_queue {
234           struct dn_flow_id id;
235           LIST_ENTRY(dn_flow_queue) q_link;
236 
237           struct dn_pkt_queue queue;    /* queue of packets */
238           u_int len;
239           u_int len_bytes;
240           u_long numbytes;              /* credit for transmission (dynamic queues) */
241 
242           uint64_t tot_pkts;            /* statistics counters */
243           uint64_t tot_bytes;
244           uint32_t drops;
245 
246           int hash_slot;                /* debugging/diagnostic */
247 
248           /* RED parameters */
249           int avg;                      /* average queue length est. (scaled) */
250           int count;                              /* arrivals since last RED drop */
251           int random;                             /* random value (scaled) */
252           uint32_t q_time;              /* start of queue idle time */
253 
254           /* WF2Q+ support */
255           struct dn_flow_set *fs;       /* parent flow set */
256           int heap_pos;                 /* position (index) of struct in heap */
257           dn_key sched_time;            /* current time when queue enters ready_heap */
258 
259           dn_key S, F;                  /* start time, finish time */
260           /*
261            * Setting F < S means the timestamp is invalid. We only need
262            * to test this when the queue is empty.
263            */
264 };
265 LIST_HEAD(dn_flowqueue_head, dn_flow_queue);
266 
267 /*
268  * flow_set descriptor.  Contains the "template" parameters for the queue
269  * configuration, and pointers to the hash table of dn_flow_queue's.
270  *
271  * The hash table is an array of lists -- we identify the slot by hashing
272  * the flow-id, then scan the list looking for a match.
273  * The size of the hash table (buckets) is configurable on a per-queue basis.
274  *
275  * A dn_flow_set is created whenever a new queue or pipe is created (in the
276  * latter case, the structure is located inside the struct dn_pipe).
277  */
278 struct dn_flow_set {
279           u_short fs_nr;                /* flow_set number */
280           u_short flags_fs;             /* see 'Flow set flags' */
281 
282           LIST_ENTRY(dn_flow_set) fs_link;
283 
284           struct dn_pipe *pipe;         /* pointer to parent pipe */
285           u_short parent_nr;            /* parent pipe#, 0 if local to a pipe */
286 
287           int weight;                             /* WFQ queue weight */
288           int qsize;                              /* queue size in slots or bytes */
289           int plr;                      /* pkt loss rate (2^31-1 means 100%) */
290 
291           struct dn_flow_id flow_mask;
292 
293           /* hash table of queues onto this flow_set */
294           int rq_size;                  /* number of slots */
295           int rq_elements;              /* active elements */
296           struct dn_flowqueue_head *rq;/* array of rq_size entries */
297 
298           uint32_t last_expired;        /* do not expire too frequently */
299           int backlogged;               /* #active queues for this flowset */
300 
301           /* RED parameters */
302           int w_q;                      /* queue weight (scaled) */
303           int max_th;                             /* maximum threshold for queue (scaled) */
304           int min_th;                             /* minimum threshold for queue (scaled) */
305           int max_p;                              /* maximum value for p_b (scaled) */
306           u_int c_1;                              /* max_p/(max_th-min_th) (scaled) */
307           u_int c_2;                              /* max_p*min_th/(max_th-min_th) (scaled) */
308           u_int c_3;                              /* for GRED, (1-max_p)/max_th (scaled) */
309           u_int c_4;                              /* for GRED, 1 - 2*max_p (scaled) */
310           u_int *w_q_lookup;            /* lookup table for computing (1-w_q)^t */
311           u_int lookup_depth;           /* depth of lookup table */
312           int lookup_step;              /* granularity inside the lookup table */
313           int lookup_weight;            /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
314           int avg_pkt_size;             /* medium packet size */
315           int max_pkt_size;             /* max packet size */
316 };
317 LIST_HEAD(dn_flowset_head, dn_flow_set);
318 
319 /*
320  * Pipe descriptor. Contains global parameters, delay-line queue, and the
321  * flow_set used for fixed-rate queues.
322  *
323  * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
324  *  + not_eligible_heap, for queues whose start time is higher than the
325  *        virtual time. Sorted by start time.
326  *  + scheduler_heap, for queues eligible for scheduling.  Sorted by finish
327  *        time.
328  *  + idle_heap, all flows that are idle and can be removed.  We do that on
329  *        each tick so we do not slow down too much operations during forwarding.
330  */
331 struct dn_pipe {              /* a pipe */
332           int pipe_nr;                  /* number */
333           int bandwidth;                /* really, bytes/tick. */
334           int delay;                              /* really, ticks */
335 
336           struct dn_pkt_queue p_queue;/* packets in delay line */
337           LIST_ENTRY(dn_pipe) p_link;
338 
339           /* WF2Q+ */
340           struct dn_heap scheduler_heap; /* top extract - key Finish time*/
341           struct dn_heap not_eligible_heap; /* top extract- key Start time */
342           struct dn_heap idle_heap;     /* random extract - key Start=Finish time */
343 
344           dn_key V;                     /* virtual time */
345           int sum;                      /* sum of weights of all active sessions */
346           int numbytes;                 /* bits I can transmit (more or less). */
347 
348           dn_key sched_time;            /* time pipe was scheduled in ready_heap */
349 
350           struct dn_flow_set fs;        /* used with fixed-rate flows */
351 };
352 LIST_HEAD(dn_pipe_head, dn_pipe);
353 
354 struct dn_sopt {
355           int       dn_sopt_name;
356           void      *dn_sopt_arg;
357           size_t    dn_sopt_arglen;
358 };
359 
360 typedef int         ip_dn_ctl_t(struct dn_sopt *);
361 typedef int         ip_dn_io_t(struct mbuf *);
362 
363 extern ip_dn_ctl_t  *ip_dn_ctl_ptr;
364 extern ip_dn_io_t   *ip_dn_io_ptr;
365 
366 void      ip_dn_queue(struct mbuf *);
367 void      ip_dn_packet_free(struct dn_pkt *);
368 void      ip_dn_packet_redispatch(struct dn_pkt *);
369 int       ip_dn_sockopt(struct sockopt *);
370 
371 #define   DUMMYNET_LOADED     (ip_dn_io_ptr != NULL)
372 
373 #endif    /* _KERNEL */
374 
375 struct dn_ioc_flowid {
376           uint16_t type;      /* ETHERTYPE_ */
377           uint16_t pad;
378           union {
379                     struct {
380                               uint32_t dst_ip;
381                               uint32_t src_ip;
382                               uint16_t dst_port;
383                               uint16_t src_port;
384                               uint8_t proto;
385                               uint8_t flags;
386                     } ip;
387                     uint8_t pad[64];
388           } u;
389 };
390 
391 struct dn_ioc_flowqueue {
392           u_int len;
393           u_int len_bytes;
394 
395           uint64_t tot_pkts;
396           uint64_t tot_bytes;
397           uint32_t drops;
398 
399           int hash_slot;                /* debugging/diagnostic */
400           dn_key S;                     /* virtual start time */
401           dn_key F;                     /* virtual finish time */
402 
403           struct dn_ioc_flowid id;
404           uint8_t reserved[16];
405 };
406 
407 struct dn_ioc_flowset {
408           u_short fs_type;              /* DN_IS_{QUEUE,PIPE}, MUST be first */
409 
410           u_short fs_nr;                /* flow_set number */
411           u_short flags_fs;             /* see 'Flow set flags' */
412           u_short parent_nr;            /* parent pipe#, 0 if local to a pipe */
413 
414           int weight;                             /* WFQ queue weight */
415           int qsize;                              /* queue size in slots or bytes */
416           int plr;                      /* pkt loss rate (2^31-1 means 100%) */
417 
418           /* Hash table information */
419           int rq_size;                  /* number of slots */
420           int rq_elements;              /* active elements */
421 
422           /* RED parameters */
423           int w_q;                      /* queue weight (scaled) */
424           int max_th;                             /* maximum threshold for queue (scaled) */
425           int min_th;                             /* minimum threshold for queue (scaled) */
426           int max_p;                              /* maximum value for p_b (scaled) */
427           int lookup_step;              /* granularity inside the lookup table */
428           int lookup_weight;            /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
429 
430           struct dn_ioc_flowid flow_mask;
431           uint8_t reserved[16];
432 };
433 
434 struct dn_ioc_pipe {
435           struct dn_ioc_flowset fs;     /* MUST be first */
436 
437           int pipe_nr;                  /* pipe number */
438           int bandwidth;                /* bit/second */
439           int delay;                              /* milliseconds */
440 
441           dn_key V;                     /* virtual time */
442 
443           uint8_t reserved[16];
444 };
445 
446 /*
447  * Flow set flags
448  */
449 #define DN_HAVE_FLOW_MASK     0x0001
450 #define DN_IS_RED             0x0002
451 #define DN_IS_GENTLE_RED      0x0004
452 #define DN_QSIZE_IS_BYTES     0x0008    /* queue size is measured in bytes */
453 #define DN_NOERROR            0x0010    /* do not report ENOBUFS on drops */
454 #define DN_IS_PIPE            0x4000
455 #define DN_IS_QUEUE           0x8000
456 
457 /*
458  * Macros for RED
459  */
460 #define SCALE_RED             16
461 #define SCALE(x)              ((x) << SCALE_RED)
462 #define SCALE_VAL(x)                    ((x) >> SCALE_RED)
463 #define SCALE_MUL(x, y)                 (((x) * (y)) >> SCALE_RED)
464 
465 /*
466  * Maximum pipe number
467  */
468 #define DN_PIPE_NR_MAX                  65536
469 
470 #endif
471 #endif /* !_IP_DUMMYNET_H */
472