xref: /freebsd-13-stable/sys/net/altq/altq_rmclass.c (revision 4b40a16f0d188422227478889b38cc341d50f88f)
1 /*-
2  * Copyright (c) 1991-1997 Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the Network Research
16  *      Group at Lawrence Berkeley Laboratory.
17  * 4. Neither the name of the University nor of the Laboratory may be used
18  *    to endorse or promote products derived from this software without
19  *    specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * LBL code modified by speer@eng.sun.com, May 1977.
34  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
35  *
36  * @(#)rm_class.c  1.48     97/12/05 SMI
37  * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $
38  */
39 #include "opt_altq.h"
40 #include "opt_inet.h"
41 #include "opt_inet6.h"
42 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
43 
44 #include <sys/param.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/socket.h>
48 #include <sys/systm.h>
49 #include <sys/errno.h>
50 #include <sys/time.h>
51 
52 #include <net/if.h>
53 #include <net/if_var.h>
54 
55 #include <net/altq/if_altq.h>
56 #include <net/altq/altq.h>
57 #include <net/altq/altq_codel.h>
58 #include <net/altq/altq_rmclass.h>
59 #include <net/altq/altq_rmclass_debug.h>
60 #include <net/altq/altq_red.h>
61 #include <net/altq/altq_rio.h>
62 
63 /*
64  * Local Macros
65  */
66 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
67 
68 /*
69  * Local routines.
70  */
71 
72 static int	rmc_satisfied(struct rm_class *, struct timeval *);
73 static void	rmc_wrr_set_weights(struct rm_ifdat *);
74 static void	rmc_depth_compute(struct rm_class *);
75 static void	rmc_depth_recompute(rm_class_t *);
76 
77 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
78 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
79 
80 static int	_rmc_addq(rm_class_t *, mbuf_t *);
81 static void	_rmc_dropq(rm_class_t *);
82 static mbuf_t	*_rmc_getq(rm_class_t *);
83 static mbuf_t	*_rmc_pollq(rm_class_t *);
84 
85 static int	rmc_under_limit(struct rm_class *, struct timeval *);
86 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
87 static void	rmc_drop_action(struct rm_class *);
88 static void	rmc_restart(void *);
89 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
90 
91 #define	BORROW_OFFTIME
92 /*
93  * BORROW_OFFTIME (experimental):
94  * borrow the offtime of the class borrowing from.
95  * the reason is that when its own offtime is set, the class is unable
96  * to borrow much, especially when cutoff is taking effect.
97  * but when the borrowed class is overloaded (advidle is close to minidle),
98  * use the borrowing class's offtime to avoid overload.
99  */
100 #define	ADJUST_CUTOFF
101 /*
102  * ADJUST_CUTOFF (experimental):
103  * if no underlimit class is found due to cutoff, increase cutoff and
104  * retry the scheduling loop.
105  * also, don't invoke delay_actions while cutoff is taking effect,
106  * since a sleeping class won't have a chance to be scheduled in the
107  * next loop.
108  *
109  * now heuristics for setting the top-level variable (cutoff_) becomes:
110  *	1. if a packet arrives for a not-overlimit class, set cutoff
111  *	   to the depth of the class.
112  *	2. if cutoff is i, and a packet arrives for an overlimit class
113  *	   with an underlimit ancestor at a lower level than i (say j),
114  *	   then set cutoff to j.
115  *	3. at scheduling a packet, if there is no underlimit class
116  *	   due to the current cutoff level, increase cutoff by 1 and
117  *	   then try to schedule again.
118  */
119 
120 /*
121  * rm_class_t *
122  * rmc_newclass(...) - Create a new resource management class at priority
123  * 'pri' on the interface given by 'ifd'.
124  *
125  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
126  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
127  *              than 100% of the bandwidth, this number should be the
128  *              'effective' rate for the class.  Let f be the
129  *              bandwidth fraction allocated to this class, and let
130  *              nsPerByte be the data rate of the output link in
131  *              nanoseconds/byte.  Then nsecPerByte is set to
132  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
133  *              for a class that gets 50% of an ethernet's bandwidth.
134  *
135  * action       the routine to call when the class is over limit.
136  *
137  * maxq         max allowable queue size for class (in packets).
138  *
139  * parent       parent class pointer.
140  *
141  * borrow       class to borrow from (should be either 'parent' or null).
142  *
143  * maxidle      max value allowed for class 'idle' time estimate (this
144  *              parameter determines how large an initial burst of packets
145  *              can be before overlimit action is invoked.
146  *
147  * offtime      how long 'delay' action will delay when class goes over
148  *              limit (this parameter determines the steady-state burst
149  *              size when a class is running over its limit).
150  *
151  * Maxidle and offtime have to be computed from the following:  If the
152  * average packet size is s, the bandwidth fraction allocated to this
153  * class is f, we want to allow b packet bursts, and the gain of the
154  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
155  *
156  *   ptime = s * nsPerByte * (1 - f) / f
157  *   maxidle = ptime * (1 - g^b) / g^b
158  *   minidle = -ptime * (1 / (f - 1))
159  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
160  *
161  * Operationally, it's convenient to specify maxidle & offtime in units
162  * independent of the link bandwidth so the maxidle & offtime passed to
163  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
164  * (The constant factor is a scale factor needed to make the parameters
165  * integers.  This scaling also means that the 'unscaled' values of
166  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
167  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
168  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
169  * maxidle also must be scaled upward by this value.  Thus, the passed
170  * values for maxidle and offtime can be computed as follows:
171  *
172  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
173  * offtime = offtime * 8 / (1000 * nsecPerByte)
174  *
175  * When USE_HRTIME is employed, then maxidle and offtime become:
176  * 	maxidle = maxilde * (8.0 / nsecPerByte);
177  * 	offtime = offtime * (8.0 / nsecPerByte);
178  */
179 struct rm_class *
rmc_newclass(int pri,struct rm_ifdat * ifd,u_int nsecPerByte,void (* action)(rm_class_t *,rm_class_t *),int maxq,struct rm_class * parent,struct rm_class * borrow,u_int maxidle,int minidle,u_int offtime,int pktsize,int flags)180 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
181     void (*action)(rm_class_t *, rm_class_t *), int maxq,
182     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
183     int minidle, u_int offtime, int pktsize, int flags)
184 {
185 	struct rm_class	*cl;
186 	struct rm_class	*peer;
187 	int		 s;
188 
189 	if (pri >= RM_MAXPRIO)
190 		return (NULL);
191 #ifndef ALTQ_RED
192 	if (flags & RMCF_RED) {
193 #ifdef ALTQ_DEBUG
194 		printf("rmc_newclass: RED not configured for CBQ!\n");
195 #endif
196 		return (NULL);
197 	}
198 #endif
199 #ifndef ALTQ_RIO
200 	if (flags & RMCF_RIO) {
201 #ifdef ALTQ_DEBUG
202 		printf("rmc_newclass: RIO not configured for CBQ!\n");
203 #endif
204 		return (NULL);
205 	}
206 #endif
207 #ifndef ALTQ_CODEL
208 	if (flags & RMCF_CODEL) {
209 #ifdef ALTQ_DEBUG
210 		printf("rmc_newclass: CODEL not configured for CBQ!\n");
211 #endif
212 		return (NULL);
213 	}
214 #endif
215 
216 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
217 	if (cl == NULL)
218 		return (NULL);
219 	CALLOUT_INIT(&cl->callout_);
220 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
221 	if (cl->q_ == NULL) {
222 		free(cl, M_DEVBUF);
223 		return (NULL);
224 	}
225 
226 	/*
227 	 * Class initialization.
228 	 */
229 	cl->children_ = NULL;
230 	cl->parent_ = parent;
231 	cl->borrow_ = borrow;
232 	cl->leaf_ = 1;
233 	cl->ifdat_ = ifd;
234 	cl->pri_ = pri;
235 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
236 	cl->depth_ = 0;
237 	cl->qthresh_ = 0;
238 	cl->ns_per_byte_ = nsecPerByte;
239 
240 	qlimit(cl->q_) = maxq;
241 	qtype(cl->q_) = Q_DROPHEAD;
242 	qlen(cl->q_) = 0;
243 	cl->flags_ = flags;
244 
245 #if 1 /* minidle is also scaled in ALTQ */
246 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
247 	if (cl->minidle_ > 0)
248 		cl->minidle_ = 0;
249 #else
250 	cl->minidle_ = minidle;
251 #endif
252 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
253 	if (cl->maxidle_ == 0)
254 		cl->maxidle_ = 1;
255 #if 1 /* offtime is also scaled in ALTQ */
256 	cl->avgidle_ = cl->maxidle_;
257 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
258 	if (cl->offtime_ == 0)
259 		cl->offtime_ = 1;
260 #else
261 	cl->avgidle_ = 0;
262 	cl->offtime_ = (offtime * nsecPerByte) / 8;
263 #endif
264 	cl->overlimit = action;
265 
266 #ifdef ALTQ_RED
267 	if (flags & (RMCF_RED|RMCF_RIO)) {
268 		int red_flags, red_pkttime;
269 
270 		red_flags = 0;
271 		if (flags & RMCF_ECN)
272 			red_flags |= REDF_ECN;
273 		if (flags & RMCF_FLOWVALVE)
274 			red_flags |= REDF_FLOWVALVE;
275 #ifdef ALTQ_RIO
276 		if (flags & RMCF_CLEARDSCP)
277 			red_flags |= RIOF_CLEARDSCP;
278 #endif
279 		red_pkttime = nsecPerByte * pktsize  / 1000;
280 
281 		if (flags & RMCF_RED) {
282 			cl->red_ = red_alloc(0, 0,
283 			    qlimit(cl->q_) * 10/100,
284 			    qlimit(cl->q_) * 30/100,
285 			    red_flags, red_pkttime);
286 			if (cl->red_ != NULL)
287 				qtype(cl->q_) = Q_RED;
288 		}
289 #ifdef ALTQ_RIO
290 		else {
291 			cl->red_ = (red_t *)rio_alloc(0, NULL,
292 						      red_flags, red_pkttime);
293 			if (cl->red_ != NULL)
294 				qtype(cl->q_) = Q_RIO;
295 		}
296 #endif
297 	}
298 #endif /* ALTQ_RED */
299 #ifdef ALTQ_CODEL
300 	if (flags & RMCF_CODEL) {
301 		cl->codel_ = codel_alloc(5, 100, 0);
302 		if (cl->codel_ != NULL)
303 			qtype(cl->q_) = Q_CODEL;
304 	}
305 #endif
306 
307 	/*
308 	 * put the class into the class tree
309 	 */
310 	s = splnet();
311 	IFQ_LOCK(ifd->ifq_);
312 	if ((peer = ifd->active_[pri]) != NULL) {
313 		/* find the last class at this pri */
314 		cl->peer_ = peer;
315 		while (peer->peer_ != ifd->active_[pri])
316 			peer = peer->peer_;
317 		peer->peer_ = cl;
318 	} else {
319 		ifd->active_[pri] = cl;
320 		cl->peer_ = cl;
321 	}
322 
323 	if (cl->parent_) {
324 		cl->next_ = parent->children_;
325 		parent->children_ = cl;
326 		parent->leaf_ = 0;
327 	}
328 
329 	/*
330 	 * Compute the depth of this class and its ancestors in the class
331 	 * hierarchy.
332 	 */
333 	rmc_depth_compute(cl);
334 
335 	/*
336 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
337 	 */
338 	if (ifd->wrr_) {
339 		ifd->num_[pri]++;
340 		ifd->alloc_[pri] += cl->allotment_;
341 		rmc_wrr_set_weights(ifd);
342 	}
343 	IFQ_UNLOCK(ifd->ifq_);
344 	splx(s);
345 	return (cl);
346 }
347 
348 int
rmc_modclass(struct rm_class * cl,u_int nsecPerByte,int maxq,u_int maxidle,int minidle,u_int offtime,int pktsize)349 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
350     int minidle, u_int offtime, int pktsize)
351 {
352 	struct rm_ifdat	*ifd;
353 	u_int		 old_allotment;
354 	int		 s;
355 
356 	ifd = cl->ifdat_;
357 	old_allotment = cl->allotment_;
358 
359 	s = splnet();
360 	IFQ_LOCK(ifd->ifq_);
361 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
362 	cl->qthresh_ = 0;
363 	cl->ns_per_byte_ = nsecPerByte;
364 
365 	qlimit(cl->q_) = maxq;
366 
367 #if 1 /* minidle is also scaled in ALTQ */
368 	cl->minidle_ = (minidle * nsecPerByte) / 8;
369 	if (cl->minidle_ > 0)
370 		cl->minidle_ = 0;
371 #else
372 	cl->minidle_ = minidle;
373 #endif
374 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
375 	if (cl->maxidle_ == 0)
376 		cl->maxidle_ = 1;
377 #if 1 /* offtime is also scaled in ALTQ */
378 	cl->avgidle_ = cl->maxidle_;
379 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
380 	if (cl->offtime_ == 0)
381 		cl->offtime_ = 1;
382 #else
383 	cl->avgidle_ = 0;
384 	cl->offtime_ = (offtime * nsecPerByte) / 8;
385 #endif
386 
387 	/*
388 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
389 	 */
390 	if (ifd->wrr_) {
391 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
392 		rmc_wrr_set_weights(ifd);
393 	}
394 	IFQ_UNLOCK(ifd->ifq_);
395 	splx(s);
396 	return (0);
397 }
398 
399 /*
400  * static void
401  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
402  *	the appropriate run robin weights for the CBQ weighted round robin
403  *	algorithm.
404  *
405  *	Returns: NONE
406  */
407 
408 static void
rmc_wrr_set_weights(struct rm_ifdat * ifd)409 rmc_wrr_set_weights(struct rm_ifdat *ifd)
410 {
411 	int		i;
412 	struct rm_class	*cl, *clh;
413 
414 	for (i = 0; i < RM_MAXPRIO; i++) {
415 		/*
416 		 * This is inverted from that of the simulator to
417 		 * maintain precision.
418 		 */
419 		if (ifd->num_[i] == 0)
420 			ifd->M_[i] = 0;
421 		else
422 			ifd->M_[i] = ifd->alloc_[i] /
423 				(ifd->num_[i] * ifd->maxpkt_);
424 		/*
425 		 * Compute the weighted allotment for each class.
426 		 * This takes the expensive div instruction out
427 		 * of the main loop for the wrr scheduling path.
428 		 * These only get recomputed when a class comes or
429 		 * goes.
430 		 */
431 		if (ifd->active_[i] != NULL) {
432 			clh = cl = ifd->active_[i];
433 			do {
434 				/* safe-guard for slow link or alloc_ == 0 */
435 				if (ifd->M_[i] == 0)
436 					cl->w_allotment_ = 0;
437 				else
438 					cl->w_allotment_ = cl->allotment_ /
439 						ifd->M_[i];
440 				cl = cl->peer_;
441 			} while ((cl != NULL) && (cl != clh));
442 		}
443 	}
444 }
445 
446 int
rmc_get_weight(struct rm_ifdat * ifd,int pri)447 rmc_get_weight(struct rm_ifdat *ifd, int pri)
448 {
449 	if ((pri >= 0) && (pri < RM_MAXPRIO))
450 		return (ifd->M_[pri]);
451 	else
452 		return (0);
453 }
454 
455 /*
456  * static void
457  * rmc_depth_compute(struct rm_class *cl) - This function computes the
458  *	appropriate depth of class 'cl' and its ancestors.
459  *
460  *	Returns:	NONE
461  */
462 
463 static void
rmc_depth_compute(struct rm_class * cl)464 rmc_depth_compute(struct rm_class *cl)
465 {
466 	rm_class_t	*t = cl, *p;
467 
468 	/*
469 	 * Recompute the depth for the branch of the tree.
470 	 */
471 	while (t != NULL) {
472 		p = t->parent_;
473 		if (p && (t->depth_ >= p->depth_)) {
474 			p->depth_ = t->depth_ + 1;
475 			t = p;
476 		} else
477 			t = NULL;
478 	}
479 }
480 
481 /*
482  * static void
483  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
484  *	the depth of the tree after a class has been deleted.
485  *
486  *	Returns:	NONE
487  */
488 
489 static void
rmc_depth_recompute(rm_class_t * cl)490 rmc_depth_recompute(rm_class_t *cl)
491 {
492 #if 1 /* ALTQ */
493 	rm_class_t	*p, *t;
494 
495 	p = cl;
496 	while (p != NULL) {
497 		if ((t = p->children_) == NULL) {
498 			p->depth_ = 0;
499 		} else {
500 			int cdepth = 0;
501 
502 			while (t != NULL) {
503 				if (t->depth_ > cdepth)
504 					cdepth = t->depth_;
505 				t = t->next_;
506 			}
507 
508 			if (p->depth_ == cdepth + 1)
509 				/* no change to this parent */
510 				return;
511 
512 			p->depth_ = cdepth + 1;
513 		}
514 
515 		p = p->parent_;
516 	}
517 #else
518 	rm_class_t	*t;
519 
520 	if (cl->depth_ >= 1) {
521 		if (cl->children_ == NULL) {
522 			cl->depth_ = 0;
523 		} else if ((t = cl->children_) != NULL) {
524 			while (t != NULL) {
525 				if (t->children_ != NULL)
526 					rmc_depth_recompute(t);
527 				t = t->next_;
528 			}
529 		} else
530 			rmc_depth_compute(cl);
531 	}
532 #endif
533 }
534 
535 /*
536  * void
537  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
538  *	function deletes a class from the link-sharing structure and frees
539  *	all resources associated with the class.
540  *
541  *	Returns: NONE
542  */
543 
544 void
rmc_delete_class(struct rm_ifdat * ifd,struct rm_class * cl)545 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
546 {
547 	struct rm_class	*p, *head, *previous;
548 	int		 s;
549 
550 	ASSERT(cl->children_ == NULL);
551 
552 	if (cl->sleeping_)
553 		CALLOUT_STOP(&cl->callout_);
554 
555 	s = splnet();
556 	IFQ_LOCK(ifd->ifq_);
557 	/*
558 	 * Free packets in the packet queue.
559 	 * XXX - this may not be a desired behavior.  Packets should be
560 	 *		re-queued.
561 	 */
562 	rmc_dropall(cl);
563 
564 	/*
565 	 * If the class has a parent, then remove the class from the
566 	 * class from the parent's children chain.
567 	 */
568 	if (cl->parent_ != NULL) {
569 		head = cl->parent_->children_;
570 		p = previous = head;
571 		if (head->next_ == NULL) {
572 			ASSERT(head == cl);
573 			cl->parent_->children_ = NULL;
574 			cl->parent_->leaf_ = 1;
575 		} else while (p != NULL) {
576 			if (p == cl) {
577 				if (cl == head)
578 					cl->parent_->children_ = cl->next_;
579 				else
580 					previous->next_ = cl->next_;
581 				cl->next_ = NULL;
582 				p = NULL;
583 			} else {
584 				previous = p;
585 				p = p->next_;
586 			}
587 		}
588 	}
589 
590 	/*
591 	 * Delete class from class priority peer list.
592 	 */
593 	if ((p = ifd->active_[cl->pri_]) != NULL) {
594 		/*
595 		 * If there is more than one member of this priority
596 		 * level, then look for class(cl) in the priority level.
597 		 */
598 		if (p != p->peer_) {
599 			while (p->peer_ != cl)
600 				p = p->peer_;
601 			p->peer_ = cl->peer_;
602 
603 			if (ifd->active_[cl->pri_] == cl)
604 				ifd->active_[cl->pri_] = cl->peer_;
605 		} else {
606 			ASSERT(p == cl);
607 			ifd->active_[cl->pri_] = NULL;
608 		}
609 	}
610 
611 	/*
612 	 * Recompute the WRR weights.
613 	 */
614 	if (ifd->wrr_) {
615 		ifd->alloc_[cl->pri_] -= cl->allotment_;
616 		ifd->num_[cl->pri_]--;
617 		rmc_wrr_set_weights(ifd);
618 	}
619 
620 	/*
621 	 * Re-compute the depth of the tree.
622 	 */
623 #if 1 /* ALTQ */
624 	rmc_depth_recompute(cl->parent_);
625 #else
626 	rmc_depth_recompute(ifd->root_);
627 #endif
628 
629 	IFQ_UNLOCK(ifd->ifq_);
630 	splx(s);
631 
632 	/*
633 	 * Free the class structure.
634 	 */
635 	if (cl->red_ != NULL) {
636 #ifdef ALTQ_RIO
637 		if (q_is_rio(cl->q_))
638 			rio_destroy((rio_t *)cl->red_);
639 #endif
640 #ifdef ALTQ_RED
641 		if (q_is_red(cl->q_))
642 			red_destroy(cl->red_);
643 #endif
644 #ifdef ALTQ_CODEL
645 		if (q_is_codel(cl->q_))
646 			codel_destroy(cl->codel_);
647 #endif
648 	}
649 	free(cl->q_, M_DEVBUF);
650 	free(cl, M_DEVBUF);
651 }
652 
653 /*
654  * void
655  * rmc_init(...) - Initialize the resource management data structures
656  *	associated with the output portion of interface 'ifp'.  'ifd' is
657  *	where the structures will be built (for backwards compatibility, the
658  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
659  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
660  *	'restart' is the driver-specific routine that the generic 'delay
661  *	until under limit' action will call to restart output.  `maxq'
662  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
663  *	is the maximum number of packets that the resource management
664  *	code will allow to be queued 'downstream' (this is typically 1).
665  *
666  *	Returns:	NONE
667  */
668 
669 void
rmc_init(struct ifaltq * ifq,struct rm_ifdat * ifd,u_int nsecPerByte,void (* restart)(struct ifaltq *),int maxq,int maxqueued,u_int maxidle,int minidle,u_int offtime,int flags)670 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
671     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
672     int minidle, u_int offtime, int flags)
673 {
674 	int		i, mtu;
675 
676 	/*
677 	 * Initialize the CBQ tracing/debug facility.
678 	 */
679 	CBQTRACEINIT();
680 
681 	bzero((char *)ifd, sizeof (*ifd));
682 	mtu = ifq->altq_ifp->if_mtu;
683 	ifd->ifq_ = ifq;
684 	ifd->restart = restart;
685 	ifd->maxqueued_ = maxqueued;
686 	ifd->ns_per_byte_ = nsecPerByte;
687 	ifd->maxpkt_ = mtu;
688 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
689 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
690 #if 1
691 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
692 	if (mtu * nsecPerByte > 10 * 1000000)
693 		ifd->maxiftime_ /= 4;
694 #endif
695 
696 	reset_cutoff(ifd);
697 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
698 
699 	/*
700 	 * Initialize the CBQ's WRR state.
701 	 */
702 	for (i = 0; i < RM_MAXPRIO; i++) {
703 		ifd->alloc_[i] = 0;
704 		ifd->M_[i] = 0;
705 		ifd->num_[i] = 0;
706 		ifd->na_[i] = 0;
707 		ifd->active_[i] = NULL;
708 	}
709 
710 	/*
711 	 * Initialize current packet state.
712 	 */
713 	ifd->qi_ = 0;
714 	ifd->qo_ = 0;
715 	for (i = 0; i < RM_MAXQUEUED; i++) {
716 		ifd->class_[i] = NULL;
717 		ifd->curlen_[i] = 0;
718 		ifd->borrowed_[i] = NULL;
719 	}
720 
721 	/*
722 	 * Create the root class of the link-sharing structure.
723 	 */
724 	if ((ifd->root_ = rmc_newclass(0, ifd,
725 				       nsecPerByte,
726 				       rmc_root_overlimit, maxq, 0, 0,
727 				       maxidle, minidle, offtime,
728 				       0, 0)) == NULL) {
729 		printf("rmc_init: root class not allocated\n");
730 		return ;
731 	}
732 	ifd->root_->depth_ = 0;
733 }
734 
735 /*
736  * void
737  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
738  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
739  *	by a driver's if_output routine.  This routine must be called with
740  *	output packet completion interrupts locked out (to avoid racing with
741  *	rmc_dequeue_next).
742  *
743  *	Returns:	0 on successful queueing
744  *			-1 when packet drop occurs
745  */
746 int
rmc_queue_packet(struct rm_class * cl,mbuf_t * m)747 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
748 {
749 	struct timeval	 now;
750 	struct rm_ifdat *ifd = cl->ifdat_;
751 	int		 cpri = cl->pri_;
752 	int		 is_empty = qempty(cl->q_);
753 
754 	RM_GETTIME(now);
755 	if (ifd->cutoff_ > 0) {
756 		if (TV_LT(&cl->undertime_, &now)) {
757 			if (ifd->cutoff_ > cl->depth_)
758 				ifd->cutoff_ = cl->depth_;
759 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
760 		}
761 #if 1 /* ALTQ */
762 		else {
763 			/*
764 			 * the class is overlimit. if the class has
765 			 * underlimit ancestors, set cutoff to the lowest
766 			 * depth among them.
767 			 */
768 			struct rm_class *borrow = cl->borrow_;
769 
770 			while (borrow != NULL &&
771 			       borrow->depth_ < ifd->cutoff_) {
772 				if (TV_LT(&borrow->undertime_, &now)) {
773 					ifd->cutoff_ = borrow->depth_;
774 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
775 					break;
776 				}
777 				borrow = borrow->borrow_;
778 			}
779 		}
780 #else /* !ALTQ */
781 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
782 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
783 				ifd->cutoff_ = cl->borrow_->depth_;
784 				CBQTRACE(rmc_queue_packet, 'ffob',
785 					 cl->borrow_->depth_);
786 			}
787 		}
788 #endif /* !ALTQ */
789 	}
790 
791 	if (_rmc_addq(cl, m) < 0)
792 		/* failed */
793 		return (-1);
794 
795 	if (is_empty) {
796 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
797 		ifd->na_[cpri]++;
798 	}
799 
800 	if (qlen(cl->q_) > qlimit(cl->q_)) {
801 		/* note: qlimit can be set to 0 or 1 */
802 		rmc_drop_action(cl);
803 		return (-1);
804 	}
805 	return (0);
806 }
807 
808 /*
809  * void
810  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
811  *	classes to see if there are satified.
812  */
813 
814 static void
rmc_tl_satisfied(struct rm_ifdat * ifd,struct timeval * now)815 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
816 {
817 	int		 i;
818 	rm_class_t	*p, *bp;
819 
820 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
821 		if ((bp = ifd->active_[i]) != NULL) {
822 			p = bp;
823 			do {
824 				if (!rmc_satisfied(p, now)) {
825 					ifd->cutoff_ = p->depth_;
826 					return;
827 				}
828 				p = p->peer_;
829 			} while (p != bp);
830 		}
831 	}
832 
833 	reset_cutoff(ifd);
834 }
835 
836 /*
837  * rmc_satisfied - Return 1 of the class is satisfied.  O, otherwise.
838  */
839 
840 static int
rmc_satisfied(struct rm_class * cl,struct timeval * now)841 rmc_satisfied(struct rm_class *cl, struct timeval *now)
842 {
843 	rm_class_t	*p;
844 
845 	if (cl == NULL)
846 		return (1);
847 	if (TV_LT(now, &cl->undertime_))
848 		return (1);
849 	if (cl->depth_ == 0) {
850 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
851 			return (0);
852 		else
853 			return (1);
854 	}
855 	if (cl->children_ != NULL) {
856 		p = cl->children_;
857 		while (p != NULL) {
858 			if (!rmc_satisfied(p, now))
859 				return (0);
860 			p = p->next_;
861 		}
862 	}
863 
864 	return (1);
865 }
866 
867 /*
868  * Return 1 if class 'cl' is under limit or can borrow from a parent,
869  * 0 if overlimit.  As a side-effect, this routine will invoke the
870  * class overlimit action if the class if overlimit.
871  */
872 
873 static int
rmc_under_limit(struct rm_class * cl,struct timeval * now)874 rmc_under_limit(struct rm_class *cl, struct timeval *now)
875 {
876 	rm_class_t	*p = cl;
877 	rm_class_t	*top;
878 	struct rm_ifdat	*ifd = cl->ifdat_;
879 
880 	ifd->borrowed_[ifd->qi_] = NULL;
881 	/*
882 	 * If cl is the root class, then always return that it is
883 	 * underlimit.  Otherwise, check to see if the class is underlimit.
884 	 */
885 	if (cl->parent_ == NULL)
886 		return (1);
887 
888 	if (cl->sleeping_) {
889 		if (TV_LT(now, &cl->undertime_))
890 			return (0);
891 
892 		CALLOUT_STOP(&cl->callout_);
893 		cl->sleeping_ = 0;
894 		cl->undertime_.tv_sec = 0;
895 		return (1);
896 	}
897 
898 	top = NULL;
899 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
900 		if (((cl = cl->borrow_) == NULL) ||
901 		    (cl->depth_ > ifd->cutoff_)) {
902 #ifdef ADJUST_CUTOFF
903 			if (cl != NULL)
904 				/* cutoff is taking effect, just
905 				   return false without calling
906 				   the delay action. */
907 				return (0);
908 #endif
909 #ifdef BORROW_OFFTIME
910 			/*
911 			 * check if the class can borrow offtime too.
912 			 * borrow offtime from the top of the borrow
913 			 * chain if the top class is not overloaded.
914 			 */
915 			if (cl != NULL) {
916 				/* cutoff is taking effect, use this class as top. */
917 				top = cl;
918 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
919 			}
920 			if (top != NULL && top->avgidle_ == top->minidle_)
921 				top = NULL;
922 			p->overtime_ = *now;
923 			(p->overlimit)(p, top);
924 #else
925 			p->overtime_ = *now;
926 			(p->overlimit)(p, NULL);
927 #endif
928 			return (0);
929 		}
930 		top = cl;
931 	}
932 
933 	if (cl != p)
934 		ifd->borrowed_[ifd->qi_] = cl;
935 	return (1);
936 }
937 
938 /*
939  * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
940  *	Packet-by-packet round robin.
941  *
942  * The heart of the weighted round-robin scheduler, which decides which
943  * class next gets to send a packet.  Highest priority first, then
944  * weighted round-robin within priorites.
945  *
946  * Each able-to-send class gets to send until its byte allocation is
947  * exhausted.  Thus, the active pointer is only changed after a class has
948  * exhausted its allocation.
949  *
950  * If the scheduler finds no class that is underlimit or able to borrow,
951  * then the first class found that had a nonzero queue and is allowed to
952  * borrow gets to send.
953  */
954 
955 static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat * ifd,int op)956 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
957 {
958 	struct rm_class	*cl = NULL, *first = NULL;
959 	u_int		 deficit;
960 	int		 cpri;
961 	mbuf_t		*m;
962 	struct timeval	 now;
963 
964 	RM_GETTIME(now);
965 
966 	/*
967 	 * if the driver polls the top of the queue and then removes
968 	 * the polled packet, we must return the same packet.
969 	 */
970 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
971 		cl = ifd->pollcache_;
972 		cpri = cl->pri_;
973 		if (ifd->efficient_) {
974 			/* check if this class is overlimit */
975 			if (cl->undertime_.tv_sec != 0 &&
976 			    rmc_under_limit(cl, &now) == 0)
977 				first = cl;
978 		}
979 		ifd->pollcache_ = NULL;
980 		goto _wrr_out;
981 	}
982 	else {
983 		/* mode == ALTDQ_POLL || pollcache == NULL */
984 		ifd->pollcache_ = NULL;
985 		ifd->borrowed_[ifd->qi_] = NULL;
986 	}
987 #ifdef ADJUST_CUTOFF
988  _again:
989 #endif
990 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
991 		if (ifd->na_[cpri] == 0)
992 			continue;
993 		deficit = 0;
994 		/*
995 		 * Loop through twice for a priority level, if some class
996 		 * was unable to send a packet the first round because
997 		 * of the weighted round-robin mechanism.
998 		 * During the second loop at this level, deficit==2.
999 		 * (This second loop is not needed if for every class,
1000 		 * "M[cl->pri_])" times "cl->allotment" is greater than
1001 		 * the byte size for the largest packet in the class.)
1002 		 */
1003  _wrr_loop:
1004 		cl = ifd->active_[cpri];
1005 		ASSERT(cl != NULL);
1006 		do {
1007 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
1008 				cl->bytes_alloc_ += cl->w_allotment_;
1009 			if (!qempty(cl->q_)) {
1010 				if ((cl->undertime_.tv_sec == 0) ||
1011 				    rmc_under_limit(cl, &now)) {
1012 					if (cl->bytes_alloc_ > 0 || deficit > 1)
1013 						goto _wrr_out;
1014 
1015 					/* underlimit but no alloc */
1016 					deficit = 1;
1017 #if 1
1018 					ifd->borrowed_[ifd->qi_] = NULL;
1019 #endif
1020 				}
1021 				else if (first == NULL && cl->borrow_ != NULL)
1022 					first = cl; /* borrowing candidate */
1023 			}
1024 
1025 			cl->bytes_alloc_ = 0;
1026 			cl = cl->peer_;
1027 		} while (cl != ifd->active_[cpri]);
1028 
1029 		if (deficit == 1) {
1030 			/* first loop found an underlimit class with deficit */
1031 			/* Loop on same priority level, with new deficit.  */
1032 			deficit = 2;
1033 			goto _wrr_loop;
1034 		}
1035 	}
1036 
1037 #ifdef ADJUST_CUTOFF
1038 	/*
1039 	 * no underlimit class found.  if cutoff is taking effect,
1040 	 * increase cutoff and try again.
1041 	 */
1042 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1043 		ifd->cutoff_++;
1044 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1045 		goto _again;
1046 	}
1047 #endif /* ADJUST_CUTOFF */
1048 	/*
1049 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1050 	 * class we encounter will send a packet if all the classes
1051 	 * of the link-sharing structure are overlimit.
1052 	 */
1053 	reset_cutoff(ifd);
1054 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1055 
1056 	if (!ifd->efficient_ || first == NULL)
1057 		return (NULL);
1058 
1059 	cl = first;
1060 	cpri = cl->pri_;
1061 #if 0	/* too time-consuming for nothing */
1062 	if (cl->sleeping_)
1063 		CALLOUT_STOP(&cl->callout_);
1064 	cl->sleeping_ = 0;
1065 	cl->undertime_.tv_sec = 0;
1066 #endif
1067 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1068 	ifd->cutoff_ = cl->borrow_->depth_;
1069 
1070 	/*
1071 	 * Deque the packet and do the book keeping...
1072 	 */
1073  _wrr_out:
1074 	if (op == ALTDQ_REMOVE) {
1075 		m = _rmc_getq(cl);
1076 		if (m == NULL)
1077 			panic("_rmc_wrr_dequeue_next");
1078 		if (qempty(cl->q_))
1079 			ifd->na_[cpri]--;
1080 
1081 		/*
1082 		 * Update class statistics and link data.
1083 		 */
1084 		if (cl->bytes_alloc_ > 0)
1085 			cl->bytes_alloc_ -= m_pktlen(m);
1086 
1087 		if ((cl->bytes_alloc_ <= 0) || first == cl)
1088 			ifd->active_[cl->pri_] = cl->peer_;
1089 		else
1090 			ifd->active_[cl->pri_] = cl;
1091 
1092 		ifd->class_[ifd->qi_] = cl;
1093 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1094 		ifd->now_[ifd->qi_] = now;
1095 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1096 		ifd->queued_++;
1097 	} else {
1098 		/* mode == ALTDQ_PPOLL */
1099 		m = _rmc_pollq(cl);
1100 		ifd->pollcache_ = cl;
1101 	}
1102 	return (m);
1103 }
1104 
1105 /*
1106  * Dequeue & return next packet from the highest priority class that
1107  * has a packet to send & has enough allocation to send it.  This
1108  * routine is called by a driver whenever it needs a new packet to
1109  * output.
1110  */
1111 static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat * ifd,int op)1112 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1113 {
1114 	mbuf_t		*m;
1115 	int		 cpri;
1116 	struct rm_class	*cl, *first = NULL;
1117 	struct timeval	 now;
1118 
1119 	RM_GETTIME(now);
1120 
1121 	/*
1122 	 * if the driver polls the top of the queue and then removes
1123 	 * the polled packet, we must return the same packet.
1124 	 */
1125 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1126 		cl = ifd->pollcache_;
1127 		cpri = cl->pri_;
1128 		ifd->pollcache_ = NULL;
1129 		goto _prr_out;
1130 	} else {
1131 		/* mode == ALTDQ_POLL || pollcache == NULL */
1132 		ifd->pollcache_ = NULL;
1133 		ifd->borrowed_[ifd->qi_] = NULL;
1134 	}
1135 #ifdef ADJUST_CUTOFF
1136  _again:
1137 #endif
1138 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1139 		if (ifd->na_[cpri] == 0)
1140 			continue;
1141 		cl = ifd->active_[cpri];
1142 		ASSERT(cl != NULL);
1143 		do {
1144 			if (!qempty(cl->q_)) {
1145 				if ((cl->undertime_.tv_sec == 0) ||
1146 				    rmc_under_limit(cl, &now))
1147 					goto _prr_out;
1148 				if (first == NULL && cl->borrow_ != NULL)
1149 					first = cl;
1150 			}
1151 			cl = cl->peer_;
1152 		} while (cl != ifd->active_[cpri]);
1153 	}
1154 
1155 #ifdef ADJUST_CUTOFF
1156 	/*
1157 	 * no underlimit class found.  if cutoff is taking effect, increase
1158 	 * cutoff and try again.
1159 	 */
1160 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1161 		ifd->cutoff_++;
1162 		goto _again;
1163 	}
1164 #endif /* ADJUST_CUTOFF */
1165 	/*
1166 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1167 	 * class we encounter will send a packet if all the classes
1168 	 * of the link-sharing structure are overlimit.
1169 	 */
1170 	reset_cutoff(ifd);
1171 	if (!ifd->efficient_ || first == NULL)
1172 		return (NULL);
1173 
1174 	cl = first;
1175 	cpri = cl->pri_;
1176 #if 0	/* too time-consuming for nothing */
1177 	if (cl->sleeping_)
1178 		CALLOUT_STOP(&cl->callout_);
1179 	cl->sleeping_ = 0;
1180 	cl->undertime_.tv_sec = 0;
1181 #endif
1182 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1183 	ifd->cutoff_ = cl->borrow_->depth_;
1184 
1185 	/*
1186 	 * Deque the packet and do the book keeping...
1187 	 */
1188  _prr_out:
1189 	if (op == ALTDQ_REMOVE) {
1190 		m = _rmc_getq(cl);
1191 		if (m == NULL)
1192 			panic("_rmc_prr_dequeue_next");
1193 		if (qempty(cl->q_))
1194 			ifd->na_[cpri]--;
1195 
1196 		ifd->active_[cpri] = cl->peer_;
1197 
1198 		ifd->class_[ifd->qi_] = cl;
1199 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1200 		ifd->now_[ifd->qi_] = now;
1201 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1202 		ifd->queued_++;
1203 	} else {
1204 		/* mode == ALTDQ_POLL */
1205 		m = _rmc_pollq(cl);
1206 		ifd->pollcache_ = cl;
1207 	}
1208 	return (m);
1209 }
1210 
1211 /*
1212  * mbuf_t *
1213  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
1214  *	is invoked by the packet driver to get the next packet to be
1215  *	dequeued and output on the link.  If WRR is enabled, then the
1216  *	WRR dequeue next routine will determine the next packet to sent.
1217  *	Otherwise, packet-by-packet round robin is invoked.
1218  *
1219  *	Returns:	NULL, if a packet is not available or if all
1220  *			classes are overlimit.
1221  *
1222  *			Otherwise, Pointer to the next packet.
1223  */
1224 
1225 mbuf_t *
rmc_dequeue_next(struct rm_ifdat * ifd,int mode)1226 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1227 {
1228 	if (ifd->queued_ >= ifd->maxqueued_)
1229 		return (NULL);
1230 	else if (ifd->wrr_)
1231 		return (_rmc_wrr_dequeue_next(ifd, mode));
1232 	else
1233 		return (_rmc_prr_dequeue_next(ifd, mode));
1234 }
1235 
1236 /*
1237  * Update the utilization estimate for the packet that just completed.
1238  * The packet's class & the parent(s) of that class all get their
1239  * estimators updated.  This routine is called by the driver's output-
1240  * packet-completion interrupt service routine.
1241  */
1242 
1243 /*
1244  * a macro to approximate "divide by 1000" that gives 0.000999,
1245  * if a value has enough effective digits.
1246  * (on pentium, mul takes 9 cycles but div takes 46!)
1247  */
1248 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1249 void
rmc_update_class_util(struct rm_ifdat * ifd)1250 rmc_update_class_util(struct rm_ifdat *ifd)
1251 {
1252 	int		 idle, avgidle, pktlen;
1253 	int		 pkt_time, tidle;
1254 	rm_class_t	*cl, *borrowed;
1255 	rm_class_t	*borrows;
1256 	struct timeval	*nowp;
1257 
1258 	/*
1259 	 * Get the most recent completed class.
1260 	 */
1261 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
1262 		return;
1263 
1264 	pktlen = ifd->curlen_[ifd->qo_];
1265 	borrowed = ifd->borrowed_[ifd->qo_];
1266 	borrows = borrowed;
1267 
1268 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1269 
1270 	/*
1271 	 * Run estimator on class and its ancestors.
1272 	 */
1273 	/*
1274 	 * rm_update_class_util is designed to be called when the
1275 	 * transfer is completed from a xmit complete interrupt,
1276 	 * but most drivers don't implement an upcall for that.
1277 	 * so, just use estimated completion time.
1278 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
1279 	 */
1280 	nowp = &ifd->now_[ifd->qo_];
1281 	/* get pkt_time (for link) in usec */
1282 #if 1  /* use approximation */
1283 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
1284 	pkt_time = NSEC_TO_USEC(pkt_time);
1285 #else
1286 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1287 #endif
1288 #if 1 /* ALTQ4PPP */
1289 	if (TV_LT(nowp, &ifd->ifnow_)) {
1290 		int iftime;
1291 
1292 		/*
1293 		 * make sure the estimated completion time does not go
1294 		 * too far.  it can happen when the link layer supports
1295 		 * data compression or the interface speed is set to
1296 		 * a much lower value.
1297 		 */
1298 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
1299 		if (iftime+pkt_time < ifd->maxiftime_) {
1300 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1301 		} else {
1302 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1303 		}
1304 	} else {
1305 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1306 	}
1307 #else
1308 	if (TV_LT(nowp, &ifd->ifnow_)) {
1309 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1310 	} else {
1311 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1312 	}
1313 #endif
1314 
1315 	while (cl != NULL) {
1316 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
1317 		if (idle >= 2000000)
1318 			/*
1319 			 * this class is idle enough, reset avgidle.
1320 			 * (TV_DELTA returns 2000000 us when delta is large.)
1321 			 */
1322 			cl->avgidle_ = cl->maxidle_;
1323 
1324 		/* get pkt_time (for class) in usec */
1325 #if 1  /* use approximation */
1326 		pkt_time = pktlen * cl->ns_per_byte_;
1327 		pkt_time = NSEC_TO_USEC(pkt_time);
1328 #else
1329 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1330 #endif
1331 		idle -= pkt_time;
1332 
1333 		avgidle = cl->avgidle_;
1334 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1335 		cl->avgidle_ = avgidle;
1336 
1337 		/* Are we overlimit ? */
1338 		if (avgidle <= 0) {
1339 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1340 #if 1 /* ALTQ */
1341 			/*
1342 			 * need some lower bound for avgidle, otherwise
1343 			 * a borrowing class gets unbounded penalty.
1344 			 */
1345 			if (avgidle < cl->minidle_)
1346 				avgidle = cl->avgidle_ = cl->minidle_;
1347 #endif
1348 			/* set next idle to make avgidle 0 */
1349 			tidle = pkt_time +
1350 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1351 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
1352 			++cl->stats_.over;
1353 		} else {
1354 			cl->avgidle_ =
1355 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1356 			cl->undertime_.tv_sec = 0;
1357 			if (cl->sleeping_) {
1358 				CALLOUT_STOP(&cl->callout_);
1359 				cl->sleeping_ = 0;
1360 			}
1361 		}
1362 
1363 		if (borrows != NULL) {
1364 			if (borrows != cl)
1365 				++cl->stats_.borrows;
1366 			else
1367 				borrows = NULL;
1368 		}
1369 		cl->last_ = ifd->ifnow_;
1370 		cl->last_pkttime_ = pkt_time;
1371 
1372 #if 1
1373 		if (cl->parent_ == NULL) {
1374 			/* take stats of root class */
1375 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1376 		}
1377 #endif
1378 
1379 		cl = cl->parent_;
1380 	}
1381 
1382 	/*
1383 	 * Check to see if cutoff needs to set to a new level.
1384 	 */
1385 	cl = ifd->class_[ifd->qo_];
1386 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1387 #if 1 /* ALTQ */
1388 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
1389 			rmc_tl_satisfied(ifd, nowp);
1390 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1391 		} else {
1392 			ifd->cutoff_ = borrowed->depth_;
1393 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1394 		}
1395 #else /* !ALTQ */
1396 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
1397 			reset_cutoff(ifd);
1398 #ifdef notdef
1399 			rmc_tl_satisfied(ifd, &now);
1400 #endif
1401 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1402 		} else {
1403 			ifd->cutoff_ = borrowed->depth_;
1404 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1405 		}
1406 #endif /* !ALTQ */
1407 	}
1408 
1409 	/*
1410 	 * Release class slot
1411 	 */
1412 	ifd->borrowed_[ifd->qo_] = NULL;
1413 	ifd->class_[ifd->qo_] = NULL;
1414 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1415 	ifd->queued_--;
1416 }
1417 
1418 /*
1419  * void
1420  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1421  *	over-limit action routines.  These get invoked by rmc_under_limit()
1422  *	if a class with packets to send if over its bandwidth limit & can't
1423  *	borrow from a parent class.
1424  *
1425  *	Returns: NONE
1426  */
1427 
1428 static void
rmc_drop_action(struct rm_class * cl)1429 rmc_drop_action(struct rm_class *cl)
1430 {
1431 	struct rm_ifdat	*ifd = cl->ifdat_;
1432 
1433 	ASSERT(qlen(cl->q_) > 0);
1434 	_rmc_dropq(cl);
1435 	if (qempty(cl->q_))
1436 		ifd->na_[cl->pri_]--;
1437 }
1438 
rmc_dropall(struct rm_class * cl)1439 void rmc_dropall(struct rm_class *cl)
1440 {
1441 	struct rm_ifdat	*ifd = cl->ifdat_;
1442 
1443 	if (!qempty(cl->q_)) {
1444 		_flushq(cl->q_);
1445 
1446 		ifd->na_[cl->pri_]--;
1447 	}
1448 }
1449 
1450 #if (__FreeBSD_version > 300000)
1451 /* hzto() is removed from FreeBSD-3.0 */
1452 static int hzto(struct timeval *);
1453 
1454 static int
hzto(tv)1455 hzto(tv)
1456 	struct timeval *tv;
1457 {
1458 	struct timeval t2;
1459 
1460 	getmicrotime(&t2);
1461 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
1462 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
1463 	return (tvtohz(&t2));
1464 }
1465 #endif /* __FreeBSD_version > 300000 */
1466 
1467 /*
1468  * void
1469  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1470  *	delay action routine.  It is invoked via rmc_under_limit when the
1471  *	packet is discoverd to be overlimit.
1472  *
1473  *	If the delay action is result of borrow class being overlimit, then
1474  *	delay for the offtime of the borrowing class that is overlimit.
1475  *
1476  *	Returns: NONE
1477  */
1478 
1479 void
rmc_delay_action(struct rm_class * cl,struct rm_class * borrow)1480 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1481 {
1482 	int	delay, t, extradelay;
1483 
1484 	cl->stats_.overactions++;
1485 	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
1486 #ifndef BORROW_OFFTIME
1487 	delay += cl->offtime_;
1488 #endif
1489 
1490 	if (!cl->sleeping_) {
1491 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1492 #ifdef BORROW_OFFTIME
1493 		if (borrow != NULL)
1494 			extradelay = borrow->offtime_;
1495 		else
1496 #endif
1497 			extradelay = cl->offtime_;
1498 
1499 #ifdef ALTQ
1500 		/*
1501 		 * XXX recalculate suspend time:
1502 		 * current undertime is (tidle + pkt_time) calculated
1503 		 * from the last transmission.
1504 		 *	tidle: time required to bring avgidle back to 0
1505 		 *	pkt_time: target waiting time for this class
1506 		 * we need to replace pkt_time by offtime
1507 		 */
1508 		extradelay -= cl->last_pkttime_;
1509 #endif
1510 		if (extradelay > 0) {
1511 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1512 			delay += extradelay;
1513 		}
1514 
1515 		cl->sleeping_ = 1;
1516 		cl->stats_.delays++;
1517 
1518 		/*
1519 		 * Since packets are phased randomly with respect to the
1520 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
1521 		 * short time so we have to wait for at least two ticks.
1522 		 * NOTE:  If there's no other traffic, we need the timer as
1523 		 * a 'backstop' to restart this class.
1524 		 */
1525 		if (delay > tick * 2) {
1526 			/* FreeBSD rounds up the tick */
1527 			t = hzto(&cl->undertime_);
1528 		} else
1529 			t = 2;
1530 		CALLOUT_RESET(&cl->callout_, t, rmc_restart, cl);
1531 	}
1532 }
1533 
1534 /*
1535  * void
1536  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
1537  *	called by the system timer code & is responsible checking if the
1538  *	class is still sleeping (it might have been restarted as a side
1539  *	effect of the queue scan on a packet arrival) and, if so, restarting
1540  *	output for the class.  Inspecting the class state & restarting output
1541  *	require locking the class structure.  In general the driver is
1542  *	responsible for locking but this is the only routine that is not
1543  *	called directly or indirectly from the interface driver so it has
1544  *	know about system locking conventions.  Under bsd, locking is done
1545  *	by raising IPL to splimp so that's what's implemented here.  On a
1546  *	different system this would probably need to be changed.
1547  *
1548  *	Returns:	NONE
1549  */
1550 
1551 static void
rmc_restart(void * arg)1552 rmc_restart(void *arg)
1553 {
1554 	struct rm_class *cl = arg;
1555 	struct rm_ifdat	*ifd = cl->ifdat_;
1556 	struct epoch_tracker et;
1557 	int		 s;
1558 
1559 	s = splnet();
1560 	NET_EPOCH_ENTER(et);
1561 	IFQ_LOCK(ifd->ifq_);
1562 	CURVNET_SET(ifd->ifq_->altq_ifp->if_vnet);
1563 	if (cl->sleeping_) {
1564 		cl->sleeping_ = 0;
1565 		cl->undertime_.tv_sec = 0;
1566 
1567 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1568 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1569 			(ifd->restart)(ifd->ifq_);
1570 		}
1571 	}
1572 	CURVNET_RESTORE();
1573 	IFQ_UNLOCK(ifd->ifq_);
1574 	NET_EPOCH_EXIT(et);
1575 	splx(s);
1576 }
1577 
1578 /*
1579  * void
1580  * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
1581  *	handling routine for the root class of the link sharing structure.
1582  *
1583  *	Returns: NONE
1584  */
1585 
1586 static void
rmc_root_overlimit(struct rm_class * cl,struct rm_class * borrow)1587 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
1588 {
1589     panic("rmc_root_overlimit");
1590 }
1591 
1592 /*
1593  * Packet Queue handling routines.  Eventually, this is to localize the
1594  *	effects on the code whether queues are red queues or droptail
1595  *	queues.
1596  */
1597 
1598 static int
_rmc_addq(rm_class_t * cl,mbuf_t * m)1599 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1600 {
1601 #ifdef ALTQ_RIO
1602 	if (q_is_rio(cl->q_))
1603 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1604 #endif
1605 #ifdef ALTQ_RED
1606 	if (q_is_red(cl->q_))
1607 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1608 #endif /* ALTQ_RED */
1609 #ifdef ALTQ_CODEL
1610 	if (q_is_codel(cl->q_))
1611 		return codel_addq(cl->codel_, cl->q_, m);
1612 #endif
1613 
1614 	if (cl->flags_ & RMCF_CLEARDSCP)
1615 		write_dsfield(m, cl->pktattr_, 0);
1616 
1617 	_addq(cl->q_, m);
1618 	return (0);
1619 }
1620 
1621 /* note: _rmc_dropq is not called for red */
1622 static void
_rmc_dropq(rm_class_t * cl)1623 _rmc_dropq(rm_class_t *cl)
1624 {
1625 	mbuf_t	*m;
1626 
1627 	if ((m = _getq(cl->q_)) != NULL)
1628 		m_freem(m);
1629 }
1630 
1631 static mbuf_t *
_rmc_getq(rm_class_t * cl)1632 _rmc_getq(rm_class_t *cl)
1633 {
1634 #ifdef ALTQ_RIO
1635 	if (q_is_rio(cl->q_))
1636 		return rio_getq((rio_t *)cl->red_, cl->q_);
1637 #endif
1638 #ifdef ALTQ_RED
1639 	if (q_is_red(cl->q_))
1640 		return red_getq(cl->red_, cl->q_);
1641 #endif
1642 #ifdef ALTQ_CODEL
1643 	if (q_is_codel(cl->q_))
1644 		return codel_getq(cl->codel_, cl->q_);
1645 #endif
1646 	return _getq(cl->q_);
1647 }
1648 
1649 static mbuf_t *
_rmc_pollq(rm_class_t * cl)1650 _rmc_pollq(rm_class_t *cl)
1651 {
1652 	return qhead(cl->q_);
1653 }
1654 
1655 #ifdef CBQ_TRACE
1656 
1657 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
1658 struct cbqtrace		*cbqtrace_ptr = NULL;
1659 int			 cbqtrace_count;
1660 
1661 /*
1662  * DDB hook to trace cbq events:
1663  *  the last 1024 events are held in a circular buffer.
1664  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
1665  */
1666 void cbqtrace_dump(int);
1667 static char *rmc_funcname(void *);
1668 
1669 static struct rmc_funcs {
1670 	void	*func;
1671 	char	*name;
1672 } rmc_funcs[] =
1673 {
1674 	rmc_init,		"rmc_init",
1675 	rmc_queue_packet,	"rmc_queue_packet",
1676 	rmc_under_limit,	"rmc_under_limit",
1677 	rmc_update_class_util,	"rmc_update_class_util",
1678 	rmc_delay_action,	"rmc_delay_action",
1679 	rmc_restart,		"rmc_restart",
1680 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
1681 	NULL,			NULL
1682 };
1683 
rmc_funcname(void * func)1684 static char *rmc_funcname(void *func)
1685 {
1686 	struct rmc_funcs *fp;
1687 
1688 	for (fp = rmc_funcs; fp->func != NULL; fp++)
1689 		if (fp->func == func)
1690 			return (fp->name);
1691 	return ("unknown");
1692 }
1693 
cbqtrace_dump(int counter)1694 void cbqtrace_dump(int counter)
1695 {
1696 	int	 i, *p;
1697 	char	*cp;
1698 
1699 	counter = counter % NCBQTRACE;
1700 	p = (int *)&cbqtrace_buffer[counter];
1701 
1702 	for (i=0; i<20; i++) {
1703 		printf("[0x%x] ", *p++);
1704 		printf("%s: ", rmc_funcname((void *)*p++));
1705 		cp = (char *)p++;
1706 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1707 		printf("%d\n",*p++);
1708 
1709 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1710 			p = (int *)cbqtrace_buffer;
1711 	}
1712 }
1713 #endif /* CBQ_TRACE */
1714 #endif /* ALTQ_CBQ */
1715 
1716 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \
1717     defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL)
1718 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1719 
1720 void
_addq(class_queue_t * q,mbuf_t * m)1721 _addq(class_queue_t *q, mbuf_t *m)
1722 {
1723         mbuf_t	*m0;
1724 
1725 	if ((m0 = qtail(q)) != NULL)
1726 		m->m_nextpkt = m0->m_nextpkt;
1727 	else
1728 		m0 = m;
1729 	m0->m_nextpkt = m;
1730 	qtail(q) = m;
1731 	qlen(q)++;
1732 }
1733 
1734 mbuf_t *
_getq(class_queue_t * q)1735 _getq(class_queue_t *q)
1736 {
1737 	mbuf_t	*m, *m0;
1738 
1739 	if ((m = qtail(q)) == NULL)
1740 		return (NULL);
1741 	if ((m0 = m->m_nextpkt) != m)
1742 		m->m_nextpkt = m0->m_nextpkt;
1743 	else {
1744 		ASSERT(qlen(q) == 1);
1745 		qtail(q) = NULL;
1746 	}
1747 	qlen(q)--;
1748 	m0->m_nextpkt = NULL;
1749 	return (m0);
1750 }
1751 
1752 /* drop a packet at the tail of the queue */
1753 mbuf_t *
_getq_tail(class_queue_t * q)1754 _getq_tail(class_queue_t *q)
1755 {
1756 	mbuf_t	*m, *m0, *prev;
1757 
1758 	if ((m = m0 = qtail(q)) == NULL)
1759 		return NULL;
1760 	do {
1761 		prev = m0;
1762 		m0 = m0->m_nextpkt;
1763 	} while (m0 != m);
1764 	prev->m_nextpkt = m->m_nextpkt;
1765 	if (prev == m)  {
1766 		ASSERT(qlen(q) == 1);
1767 		qtail(q) = NULL;
1768 	} else
1769 		qtail(q) = prev;
1770 	qlen(q)--;
1771 	m->m_nextpkt = NULL;
1772 	return (m);
1773 }
1774 
1775 /* randomly select a packet in the queue */
1776 mbuf_t *
_getq_random(class_queue_t * q)1777 _getq_random(class_queue_t *q)
1778 {
1779 	struct mbuf	*m;
1780 	int		 i, n;
1781 
1782 	if ((m = qtail(q)) == NULL)
1783 		return NULL;
1784 	if (m->m_nextpkt == m) {
1785 		ASSERT(qlen(q) == 1);
1786 		qtail(q) = NULL;
1787 	} else {
1788 		struct mbuf *prev = NULL;
1789 
1790 		n = arc4random() % qlen(q) + 1;
1791 		for (i = 0; i < n; i++) {
1792 			prev = m;
1793 			m = m->m_nextpkt;
1794 		}
1795 		prev->m_nextpkt = m->m_nextpkt;
1796 		if (m == qtail(q))
1797 			qtail(q) = prev;
1798 	}
1799 	qlen(q)--;
1800 	m->m_nextpkt = NULL;
1801 	return (m);
1802 }
1803 
1804 void
_removeq(class_queue_t * q,mbuf_t * m)1805 _removeq(class_queue_t *q, mbuf_t *m)
1806 {
1807 	mbuf_t	*m0, *prev;
1808 
1809 	m0 = qtail(q);
1810 	do {
1811 		prev = m0;
1812 		m0 = m0->m_nextpkt;
1813 	} while (m0 != m);
1814 	prev->m_nextpkt = m->m_nextpkt;
1815 	if (prev == m)
1816 		qtail(q) = NULL;
1817 	else if (qtail(q) == m)
1818 		qtail(q) = prev;
1819 	qlen(q)--;
1820 }
1821 
1822 void
_flushq(class_queue_t * q)1823 _flushq(class_queue_t *q)
1824 {
1825 	mbuf_t *m;
1826 
1827 	while ((m = _getq(q)) != NULL)
1828 		m_freem(m);
1829 	ASSERT(qlen(q) == 0);
1830 }
1831 
1832 #endif /* !__GNUC__ || ALTQ_DEBUG */
1833 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1834