1 /*	$OpenBSD: altq_rmclass.c,v 1.11 2003/10/17 19:13:01 henning Exp $	*/
2 /*	$KAME: altq_rmclass.c,v 1.10 2001/02/09 07:20:40 kjc Exp $	*/
3 
4 /*
5  * Copyright (c) 1991-1997 Regents of the University of California.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the Network Research
19  *      Group at Lawrence Berkeley Laboratory.
20  * 4. Neither the name of the University nor of the Laboratory may be used
21  *    to endorse or promote products derived from this software without
22  *    specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * LBL code modified by speer@eng.sun.com, May 1977.
37  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
38  */
39 
40 #include <sys/param.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/systm.h>
45 #include <sys/errno.h>
46 #include <sys/time.h>
47 
48 #include <net/if.h>
49 
50 #include <altq/altq.h>
51 #include <altq/altq_rmclass.h>
52 #include <altq/altq_rmclass_debug.h>
53 #include <altq/altq_red.h>
54 #include <altq/altq_rio.h>
55 
56 /*
57  * Local Macros
58  */
59 
60 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
61 
62 /*
63  * Local routines.
64  */
65 
66 static int	 rmc_satisfied(struct rm_class *, struct timeval *);
67 static void	 rmc_wrr_set_weights(struct rm_ifdat *);
68 static void	 rmc_depth_compute(struct rm_class *);
69 static void	 rmc_depth_recompute(rm_class_t *);
70 
71 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
72 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
73 
74 static int	 _rmc_addq(rm_class_t *, mbuf_t *);
75 static void	 _rmc_dropq(rm_class_t *);
76 static mbuf_t	*_rmc_getq(rm_class_t *);
77 static mbuf_t	*_rmc_pollq(rm_class_t *);
78 
79 static int	 rmc_under_limit(struct rm_class *, struct timeval *);
80 static void	 rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
81 static void	 rmc_drop_action(struct rm_class *);
82 static void	 rmc_restart(struct rm_class *);
83 static void	 rmc_root_overlimit(struct rm_class *, struct rm_class *);
84 
85 #define	BORROW_OFFTIME
86 /*
87  * BORROW_OFFTIME (experimental):
88  * borrow the offtime of the class borrowing from.
89  * the reason is that when its own offtime is set, the class is unable
90  * to borrow much, especially when cutoff is taking effect.
91  * but when the borrowed class is overloaded (advidle is close to minidle),
92  * use the borrowing class's offtime to avoid overload.
93  */
94 #define	ADJUST_CUTOFF
95 /*
96  * ADJUST_CUTOFF (experimental):
97  * if no underlimit class is found due to cutoff, increase cutoff and
98  * retry the scheduling loop.
99  * also, don't invoke delay_actions while cutoff is taking effect,
100  * since a sleeping class won't have a chance to be scheduled in the
101  * next loop.
102  *
103  * now heuristics for setting the top-level variable (cutoff_) becomes:
104  *	1. if a packet arrives for a not-overlimit class, set cutoff
105  *	   to the depth of the class.
106  *	2. if cutoff is i, and a packet arrives for an overlimit class
107  *	   with an underlimit ancestor at a lower level than i (say j),
108  *	   then set cutoff to j.
109  *	3. at scheduling a packet, if there is no underlimit class
110  *	   due to the current cutoff level, increase cutoff by 1 and
111  *	   then try to schedule again.
112  */
113 
114 /*
115  * rm_class_t *
116  * rmc_newclass(...) - Create a new resource management class at priority
117  * 'pri' on the interface given by 'ifd'.
118  *
119  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
120  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
121  *              than 100% of the bandwidth, this number should be the
122  *              'effective' rate for the class.  Let f be the
123  *              bandwidth fraction allocated to this class, and let
124  *              nsPerByte be the data rate of the output link in
125  *              nanoseconds/byte.  Then nsecPerByte is set to
126  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
127  *              for a class that gets 50% of an ethernet's bandwidth.
128  *
129  * action       the routine to call when the class is over limit.
130  *
131  * maxq         max allowable queue size for class (in packets).
132  *
133  * parent       parent class pointer.
134  *
135  * borrow       class to borrow from (should be either 'parent' or null).
136  *
137  * maxidle      max value allowed for class 'idle' time estimate (this
138  *              parameter determines how large an initial burst of packets
139  *              can be before overlimit action is invoked.
140  *
141  * offtime      how long 'delay' action will delay when class goes over
142  *              limit (this parameter determines the steady-state burst
143  *              size when a class is running over its limit).
144  *
145  * Maxidle and offtime have to be computed from the following:  If the
146  * average packet size is s, the bandwidth fraction allocated to this
147  * class is f, we want to allow b packet bursts, and the gain of the
148  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
149  *
150  *   ptime = s * nsPerByte * (1 - f) / f
151  *   maxidle = ptime * (1 - g^b) / g^b
152  *   minidle = -ptime * (1 / (f - 1))
153  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
154  *
155  * Operationally, it's convenient to specify maxidle & offtime in units
156  * independent of the link bandwidth so the maxidle & offtime passed to
157  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
158  * (The constant factor is a scale factor needed to make the parameters
159  * integers.  This scaling also means that the 'unscaled' values of
160  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
161  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
162  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
163  * maxidle also must be scaled upward by this value.  Thus, the passed
164  * values for maxidle and offtime can be computed as follows:
165  *
166  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
167  * offtime = offtime * 8 / (1000 * nsecPerByte)
168  *
169  * When USE_HRTIME is employed, then maxidle and offtime become:
170  *	maxidle = maxilde * (8.0 / nsecPerByte);
171  *	offtime = offtime * (8.0 / nsecPerByte);
172  */
173 
174 struct rm_class *
rmc_newclass(int pri,struct rm_ifdat * ifd,u_int nsecPerByte,void (* action)(rm_class_t *,rm_class_t *),int maxq,struct rm_class * parent,struct rm_class * borrow,u_int maxidle,int minidle,u_int offtime,int pktsize,int flags)175 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
176     void (*action)(rm_class_t *, rm_class_t *), int maxq,
177     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
178     int minidle, u_int offtime, int pktsize, int flags)
179 {
180 	struct rm_class	*cl;
181 	struct rm_class	*peer;
182 	int		 s;
183 
184 	if (pri >= RM_MAXPRIO)
185 		return (NULL);
186 #ifndef ALTQ_RED
187 	if (flags & RMCF_RED) {
188 #ifdef ALTQ_DEBUG
189 		printf("rmc_newclass: RED not configured for CBQ!\n");
190 #endif
191 		return (NULL);
192 	}
193 #endif
194 #ifndef ALTQ_RIO
195 	if (flags & RMCF_RIO) {
196 #ifdef ALTQ_DEBUG
197 		printf("rmc_newclass: RIO not configured for CBQ!\n");
198 #endif
199 		return (NULL);
200 	}
201 #endif
202 
203 	MALLOC(cl, struct rm_class *, sizeof(struct rm_class),
204 	       M_DEVBUF, M_WAITOK);
205 	if (cl == NULL)
206 		return (NULL);
207 	bzero(cl, sizeof(struct rm_class));
208 	CALLOUT_INIT(&cl->callout_);
209 	MALLOC(cl->q_, class_queue_t *, sizeof(class_queue_t),
210 	       M_DEVBUF, M_WAITOK);
211 	if (cl->q_ == NULL) {
212 		FREE(cl, M_DEVBUF);
213 		return (NULL);
214 	}
215 	bzero(cl->q_, sizeof(class_queue_t));
216 
217 	/*
218 	 * Class initialization.
219 	 */
220 	cl->children_ = NULL;
221 	cl->parent_ = parent;
222 	cl->borrow_ = borrow;
223 	cl->leaf_ = 1;
224 	cl->ifdat_ = ifd;
225 	cl->pri_ = pri;
226 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
227 	cl->depth_ = 0;
228 	cl->qthresh_ = 0;
229 	cl->ns_per_byte_ = nsecPerByte;
230 
231 	qlimit(cl->q_) = maxq;
232 	qtype(cl->q_) = Q_DROPHEAD;
233 	qlen(cl->q_) = 0;
234 	cl->flags_ = flags;
235 
236 #if 1 /* minidle is also scaled in ALTQ */
237 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
238 	if (cl->minidle_ > 0)
239 		cl->minidle_ = 0;
240 #else
241 	cl->minidle_ = minidle;
242 #endif
243 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
244 	if (cl->maxidle_ == 0)
245 		cl->maxidle_ = 1;
246 #if 1 /* offtime is also scaled in ALTQ */
247 	cl->avgidle_ = cl->maxidle_;
248 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
249 	if (cl->offtime_ == 0)
250 		cl->offtime_ = 1;
251 #else
252 	cl->avgidle_ = 0;
253 	cl->offtime_ = (offtime * nsecPerByte) / 8;
254 #endif
255 	cl->overlimit = action;
256 
257 #ifdef ALTQ_RED
258 	if (flags & (RMCF_RED|RMCF_RIO)) {
259 		int red_flags, red_pkttime;
260 
261 		red_flags = 0;
262 		if (flags & RMCF_ECN)
263 			red_flags |= REDF_ECN;
264 		if (flags & RMCF_FLOWVALVE)
265 			red_flags |= REDF_FLOWVALVE;
266 #ifdef ALTQ_RIO
267 		if (flags & RMCF_CLEARDSCP)
268 			red_flags |= RIOF_CLEARDSCP;
269 #endif
270 		red_pkttime = nsecPerByte * pktsize  / 1000;
271 
272 		if (flags & RMCF_RED) {
273 			cl->red_ = red_alloc(0, 0,
274 			    qlimit(cl->q_) * 10/100,
275 			    qlimit(cl->q_) * 30/100,
276 			    red_flags, red_pkttime);
277 			if (cl->red_ != NULL)
278 				qtype(cl->q_) = Q_RED;
279 		}
280 #ifdef ALTQ_RIO
281 		else {
282 			cl->red_ = (red_t *)rio_alloc(0, NULL,
283 						      red_flags, red_pkttime);
284 			if (cl->red_ != NULL)
285 				qtype(cl->q_) = Q_RIO;
286 		}
287 #endif
288 	}
289 #endif /* ALTQ_RED */
290 
291 	/*
292 	 * put the class into the class tree
293 	 */
294 	s = splimp();
295 	if ((peer = ifd->active_[pri]) != NULL) {
296 		/* find the last class at this pri */
297 		cl->peer_ = peer;
298 		while (peer->peer_ != ifd->active_[pri])
299 			peer = peer->peer_;
300 		peer->peer_ = cl;
301 	} else {
302 		ifd->active_[pri] = cl;
303 		cl->peer_ = cl;
304 	}
305 
306 	if (cl->parent_) {
307 		cl->next_ = parent->children_;
308 		parent->children_ = cl;
309 		parent->leaf_ = 0;
310 	}
311 
312 	/*
313 	 * Compute the depth of this class and its ancestors in the class
314 	 * hierarchy.
315 	 */
316 	rmc_depth_compute(cl);
317 
318 	/*
319 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
320 	 */
321 	if (ifd->wrr_) {
322 		ifd->num_[pri]++;
323 		ifd->alloc_[pri] += cl->allotment_;
324 		rmc_wrr_set_weights(ifd);
325 	}
326 	splx(s);
327 	return (cl);
328 }
329 
330 int
rmc_modclass(struct rm_class * cl,u_int nsecPerByte,int maxq,u_int maxidle,int minidle,u_int offtime,int pktsize)331 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
332     int minidle, u_int offtime, int pktsize)
333 {
334 	struct rm_ifdat	*ifd;
335 	u_int		 old_allotment;
336 	int		 s;
337 
338 	ifd = cl->ifdat_;
339 	old_allotment = cl->allotment_;
340 
341 	s = splimp();
342 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
343 	cl->qthresh_ = 0;
344 	cl->ns_per_byte_ = nsecPerByte;
345 
346 	qlimit(cl->q_) = maxq;
347 
348 #if 1 /* minidle is also scaled in ALTQ */
349 	cl->minidle_ = (minidle * nsecPerByte) / 8;
350 	if (cl->minidle_ > 0)
351 		cl->minidle_ = 0;
352 #else
353 	cl->minidle_ = minidle;
354 #endif
355 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
356 	if (cl->maxidle_ == 0)
357 		cl->maxidle_ = 1;
358 #if 1 /* offtime is also scaled in ALTQ */
359 	cl->avgidle_ = cl->maxidle_;
360 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
361 	if (cl->offtime_ == 0)
362 		cl->offtime_ = 1;
363 #else
364 	cl->avgidle_ = 0;
365 	cl->offtime_ = (offtime * nsecPerByte) / 8;
366 #endif
367 
368 	/*
369 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
370 	 */
371 	if (ifd->wrr_) {
372 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
373 		rmc_wrr_set_weights(ifd);
374 	}
375 	splx(s);
376 	return (0);
377 }
378 
379 /*
380  * static void
381  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
382  *	the appropriate run robin weights for the CBQ weighted round robin
383  *	algorithm.
384  *
385  *	Returns: NONE
386  */
387 
388 static void
rmc_wrr_set_weights(struct rm_ifdat * ifd)389 rmc_wrr_set_weights(struct rm_ifdat *ifd)
390 {
391 	int		i;
392 	struct rm_class	*cl, *clh;
393 
394 	for (i = 0; i < RM_MAXPRIO; i++) {
395 		/*
396 		 * This is inverted from that of the simulator to
397 		 * maintain precision.
398 		 */
399 		if (ifd->num_[i] == 0)
400 			ifd->M_[i] = 0;
401 		else
402 			ifd->M_[i] = ifd->alloc_[i] /
403 				(ifd->num_[i] * ifd->maxpkt_);
404 		/*
405 		 * Compute the weighted allotment for each class.
406 		 * This takes the expensive div instruction out
407 		 * of the main loop for the wrr scheduling path.
408 		 * These only get recomputed when a class comes or
409 		 * goes.
410 		 */
411 		if (ifd->active_[i] != NULL) {
412 			clh = cl = ifd->active_[i];
413 			do {
414 				/* safe-guard for slow link or alloc_ == 0 */
415 				if (ifd->M_[i] == 0)
416 					cl->w_allotment_ = 0;
417 				else
418 					cl->w_allotment_ = cl->allotment_ /
419 						ifd->M_[i];
420 				cl = cl->peer_;
421 			} while ((cl != NULL) && (cl != clh));
422 		}
423 	}
424 }
425 
426 int
rmc_get_weight(struct rm_ifdat * ifd,int pri)427 rmc_get_weight(struct rm_ifdat *ifd, int pri)
428 {
429 	if ((pri >= 0) && (pri < RM_MAXPRIO))
430 		return (ifd->M_[pri]);
431 	else
432 		return (0);
433 }
434 
435 /*
436  * static void
437  * rmc_depth_compute(struct rm_class *cl) - This function computes the
438  *	appropriate depth of class 'cl' and its ancestors.
439  *
440  *	Returns:	NONE
441  */
442 
443 static void
rmc_depth_compute(struct rm_class * cl)444 rmc_depth_compute(struct rm_class *cl)
445 {
446 	rm_class_t	*t = cl, *p;
447 
448 	/*
449 	 * Recompute the depth for the branch of the tree.
450 	 */
451 	while (t != NULL) {
452 		p = t->parent_;
453 		if (p && (t->depth_ >= p->depth_)) {
454 			p->depth_ = t->depth_ + 1;
455 			t = p;
456 		} else
457 			t = NULL;
458 	}
459 }
460 
461 /*
462  * static void
463  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
464  *	the depth of the tree after a class has been deleted.
465  *
466  *	Returns:	NONE
467  */
468 
469 static void
rmc_depth_recompute(rm_class_t * cl)470 rmc_depth_recompute(rm_class_t *cl)
471 {
472 #if 1 /* ALTQ */
473 	rm_class_t	*p, *t;
474 
475 	p = cl;
476 	while (p != NULL) {
477 		if ((t = p->children_) == NULL) {
478 			p->depth_ = 0;
479 		} else {
480 			int cdepth = 0;
481 
482 			while (t != NULL) {
483 				if (t->depth_ > cdepth)
484 					cdepth = t->depth_;
485 				t = t->next_;
486 			}
487 
488 			if (p->depth_ == cdepth + 1)
489 				/* no change to this parent */
490 				return;
491 
492 			p->depth_ = cdepth + 1;
493 		}
494 
495 		p = p->parent_;
496 	}
497 #else
498 	rm_class_t	*t;
499 
500 	if (cl->depth_ >= 1) {
501 		if (cl->children_ == NULL) {
502 			cl->depth_ = 0;
503 		} else if ((t = cl->children_) != NULL) {
504 			while (t != NULL) {
505 				if (t->children_ != NULL)
506 					rmc_depth_recompute(t);
507 				t = t->next_;
508 			}
509 		} else
510 			rmc_depth_compute(cl);
511 	}
512 #endif
513 }
514 
515 /*
516  * void
517  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
518  *	function deletes a class from the link-sharing structure and frees
519  *	all resources associated with the class.
520  *
521  *	Returns: NONE
522  */
523 
524 void
rmc_delete_class(struct rm_ifdat * ifd,struct rm_class * cl)525 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
526 {
527 	struct rm_class	*p, *head, *previous;
528 	int		 s;
529 
530 	ASSERT(cl->children_ == NULL);
531 
532 	if (cl->sleeping_)
533 		CALLOUT_STOP(&cl->callout_);
534 
535 	s = splimp();
536 	/*
537 	 * Free packets in the packet queue.
538 	 * XXX - this may not be a desired behavior.  Packets should be
539 	 *		re-queued.
540 	 */
541 	rmc_dropall(cl);
542 
543 	/*
544 	 * If the class has a parent, then remove the class from the
545 	 * class from the parent's children chain.
546 	 */
547 	if (cl->parent_ != NULL) {
548 		head = cl->parent_->children_;
549 		p = previous = head;
550 		if (head->next_ == NULL) {
551 			ASSERT(head == cl);
552 			cl->parent_->children_ = NULL;
553 			cl->parent_->leaf_ = 1;
554 		} else while (p != NULL) {
555 			if (p == cl) {
556 				if (cl == head)
557 					cl->parent_->children_ = cl->next_;
558 				else
559 					previous->next_ = cl->next_;
560 				cl->next_ = NULL;
561 				p = NULL;
562 			} else {
563 				previous = p;
564 				p = p->next_;
565 			}
566 		}
567 	}
568 
569 	/*
570 	 * Delete class from class priority peer list.
571 	 */
572 	if ((p = ifd->active_[cl->pri_]) != NULL) {
573 		/*
574 		 * If there is more than one member of this priority
575 		 * level, then look for class(cl) in the priority level.
576 		 */
577 		if (p != p->peer_) {
578 			while (p->peer_ != cl)
579 				p = p->peer_;
580 			p->peer_ = cl->peer_;
581 
582 			if (ifd->active_[cl->pri_] == cl)
583 				ifd->active_[cl->pri_] = cl->peer_;
584 		} else {
585 			ASSERT(p == cl);
586 			ifd->active_[cl->pri_] = NULL;
587 		}
588 	}
589 
590 	/*
591 	 * Recompute the WRR weights.
592 	 */
593 	if (ifd->wrr_) {
594 		ifd->alloc_[cl->pri_] -= cl->allotment_;
595 		ifd->num_[cl->pri_]--;
596 		rmc_wrr_set_weights(ifd);
597 	}
598 
599 	/*
600 	 * Re-compute the depth of the tree.
601 	 */
602 #if 1 /* ALTQ */
603 	rmc_depth_recompute(cl->parent_);
604 #else
605 	rmc_depth_recompute(ifd->root_);
606 #endif
607 
608 	splx(s);
609 
610 	/*
611 	 * Free the class structure.
612 	 */
613 	if (cl->red_ != NULL) {
614 #ifdef ALTQ_RIO
615 		if (q_is_rio(cl->q_))
616 			rio_destroy((rio_t *)cl->red_);
617 #endif
618 #ifdef ALTQ_RED
619 		if (q_is_red(cl->q_))
620 			red_destroy(cl->red_);
621 #endif
622 	}
623 	FREE(cl->q_, M_DEVBUF);
624 	FREE(cl, M_DEVBUF);
625 }
626 
627 
628 /*
629  * void
630  * rmc_init(...) - Initialize the resource management data structures
631  *	associated with the output portion of interface 'ifp'.  'ifd' is
632  *	where the structures will be built (for backwards compatibility, the
633  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
634  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
635  *	'restart' is the driver-specific routine that the generic 'delay
636  *	until under limit' action will call to restart output.  `maxq'
637  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
638  *	is the maximum number of packets that the resource management
639  *	code will allow to be queued 'downstream' (this is typically 1).
640  *
641  *	Returns:	NONE
642  */
643 
644 void
rmc_init(struct ifaltq * ifq,struct rm_ifdat * ifd,u_int nsecPerByte,void (* restart)(struct ifaltq *),int maxq,int maxqueued,u_int maxidle,int minidle,u_int offtime,int flags)645 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
646     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
647     int minidle, u_int offtime, int flags)
648 {
649 	int		i, mtu;
650 
651 	/*
652 	 * Initialize the CBQ tracing/debug facility.
653 	 */
654 	CBQTRACEINIT();
655 
656 	bzero((char *)ifd, sizeof (*ifd));
657 	mtu = ifq->altq_ifp->if_mtu;
658 	ifd->ifq_ = ifq;
659 	ifd->restart = restart;
660 	ifd->maxqueued_ = maxqueued;
661 	ifd->ns_per_byte_ = nsecPerByte;
662 	ifd->maxpkt_ = mtu;
663 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
664 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
665 #if 1
666 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
667 	if (mtu * nsecPerByte > 10 * 1000000)
668 		ifd->maxiftime_ /= 4;
669 #endif
670 
671 	reset_cutoff(ifd);
672 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
673 
674 	/*
675 	 * Initialize the CBQ's WRR state.
676 	 */
677 	for (i = 0; i < RM_MAXPRIO; i++) {
678 		ifd->alloc_[i] = 0;
679 		ifd->M_[i] = 0;
680 		ifd->num_[i] = 0;
681 		ifd->na_[i] = 0;
682 		ifd->active_[i] = NULL;
683 	}
684 
685 	/*
686 	 * Initialize current packet state.
687 	 */
688 	ifd->qi_ = 0;
689 	ifd->qo_ = 0;
690 	for (i = 0; i < RM_MAXQUEUED; i++) {
691 		ifd->class_[i] = NULL;
692 		ifd->curlen_[i] = 0;
693 		ifd->borrowed_[i] = NULL;
694 	}
695 
696 	/*
697 	 * Create the root class of the link-sharing structure.
698 	 */
699 	if ((ifd->root_ = rmc_newclass(0, ifd,
700 				       nsecPerByte,
701 				       rmc_root_overlimit, maxq, 0, 0,
702 				       maxidle, minidle, offtime,
703 				       0, 0)) == NULL) {
704 		printf("rmc_init: root class not allocated\n");
705 		return ;
706 	}
707 	ifd->root_->depth_ = 0;
708 }
709 
710 /*
711  * void
712  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
713  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
714  *	by a driver's if_output routine.  This routine must be called with
715  *	output packet completion interrupts locked out (to avoid racing with
716  *	rmc_dequeue_next).
717  *
718  *	Returns:	0 on successful queueing
719  *			-1 when packet drop occurs
720  */
721 int
rmc_queue_packet(struct rm_class * cl,mbuf_t * m)722 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
723 {
724 	struct timeval	 now;
725 	struct rm_ifdat *ifd = cl->ifdat_;
726 	int		 cpri = cl->pri_;
727 	int		 is_empty = qempty(cl->q_);
728 
729 	RM_GETTIME(now);
730 	if (ifd->cutoff_ > 0) {
731 		if (TV_LT(&cl->undertime_, &now)) {
732 			if (ifd->cutoff_ > cl->depth_)
733 				ifd->cutoff_ = cl->depth_;
734 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
735 		}
736 #if 1 /* ALTQ */
737 		else {
738 			/*
739 			 * the class is overlimit. if the class has
740 			 * underlimit ancestors, set cutoff to the lowest
741 			 * depth among them.
742 			 */
743 			struct rm_class *borrow = cl->borrow_;
744 
745 			while (borrow != NULL &&
746 			       borrow->depth_ < ifd->cutoff_) {
747 				if (TV_LT(&borrow->undertime_, &now)) {
748 					ifd->cutoff_ = borrow->depth_;
749 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
750 					break;
751 				}
752 				borrow = borrow->borrow_;
753 			}
754 		}
755 #else /* !ALTQ */
756 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
757 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
758 				ifd->cutoff_ = cl->borrow_->depth_;
759 				CBQTRACE(rmc_queue_packet, 'ffob',
760 					 cl->borrow_->depth_);
761 			}
762 		}
763 #endif /* !ALTQ */
764 	}
765 
766 	if (_rmc_addq(cl, m) < 0)
767 		/* failed */
768 		return (-1);
769 
770 	if (is_empty) {
771 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
772 		ifd->na_[cpri]++;
773 	}
774 
775 	if (qlen(cl->q_) > qlimit(cl->q_)) {
776 		/* note: qlimit can be set to 0 or 1 */
777 		rmc_drop_action(cl);
778 		return (-1);
779 	}
780 	return (0);
781 }
782 
783 /*
784  * void
785  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
786  *	classes to see if they are satisfied.
787  */
788 
789 static void
rmc_tl_satisfied(struct rm_ifdat * ifd,struct timeval * now)790 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
791 {
792 	int		 i;
793 	rm_class_t	*p, *bp;
794 
795 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
796 		if ((bp = ifd->active_[i]) != NULL) {
797 			p = bp;
798 			do {
799 				if (!rmc_satisfied(p, now)) {
800 					ifd->cutoff_ = p->depth_;
801 					return;
802 				}
803 				p = p->peer_;
804 			} while (p != bp);
805 		}
806 	}
807 
808 	reset_cutoff(ifd);
809 }
810 
811 /*
812  * rmc_satisfied - Return 1 of the class is satisfied.  O, otherwise.
813  */
814 
815 static int
rmc_satisfied(struct rm_class * cl,struct timeval * now)816 rmc_satisfied(struct rm_class *cl, struct timeval *now)
817 {
818 	rm_class_t	*p;
819 
820 	if (cl == NULL)
821 		return (1);
822 	if (TV_LT(now, &cl->undertime_))
823 		return (1);
824 	if (cl->depth_ == 0) {
825 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
826 			return (0);
827 		else
828 			return (1);
829 	}
830 	if (cl->children_ != NULL) {
831 		p = cl->children_;
832 		while (p != NULL) {
833 			if (!rmc_satisfied(p, now))
834 				return (0);
835 			p = p->next_;
836 		}
837 	}
838 
839 	return (1);
840 }
841 
842 /*
843  * Return 1 if class 'cl' is under limit or can borrow from a parent,
844  * 0 if overlimit.  As a side-effect, this routine will invoke the
845  * class overlimit action if the class if overlimit.
846  */
847 
848 static int
rmc_under_limit(struct rm_class * cl,struct timeval * now)849 rmc_under_limit(struct rm_class *cl, struct timeval *now)
850 {
851 	rm_class_t	*p = cl;
852 	rm_class_t	*top;
853 	struct rm_ifdat	*ifd = cl->ifdat_;
854 
855 	ifd->borrowed_[ifd->qi_] = NULL;
856 	/*
857 	 * If cl is the root class, then always return that it is
858 	 * underlimit.  Otherwise, check to see if the class is underlimit.
859 	 */
860 	if (cl->parent_ == NULL)
861 		return (1);
862 
863 	if (cl->sleeping_) {
864 		if (TV_LT(now, &cl->undertime_))
865 			return (0);
866 
867 		CALLOUT_STOP(&cl->callout_);
868 		cl->sleeping_ = 0;
869 		cl->undertime_.tv_sec = 0;
870 		return (1);
871 	}
872 
873 	top = NULL;
874 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
875 		if (((cl = cl->borrow_) == NULL) ||
876 		    (cl->depth_ > ifd->cutoff_)) {
877 #ifdef ADJUST_CUTOFF
878 			if (cl != NULL)
879 				/* cutoff is taking effect, just
880 				   return false without calling
881 				   the delay action. */
882 				return (0);
883 #endif
884 #ifdef BORROW_OFFTIME
885 			/*
886 			 * check if the class can borrow offtime too.
887 			 * borrow offtime from the top of the borrow
888 			 * chain if the top class is not overloaded.
889 			 */
890 			if (cl != NULL) {
891 				/* cutoff is taking effect, use this class as top. */
892 				top = cl;
893 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
894 			}
895 			if (top != NULL && top->avgidle_ == top->minidle_)
896 				top = NULL;
897 			p->overtime_ = *now;
898 			(p->overlimit)(p, top);
899 #else
900 			p->overtime_ = *now;
901 			(p->overlimit)(p, NULL);
902 #endif
903 			return (0);
904 		}
905 		top = cl;
906 	}
907 
908 	if (cl != p)
909 		ifd->borrowed_[ifd->qi_] = cl;
910 	return (1);
911 }
912 
913 /*
914  * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
915  *	Packet-by-packet round robin.
916  *
917  * The heart of the weighted round-robin scheduler, which decides which
918  * class next gets to send a packet.  Highest priority first, then
919  * weighted round-robin within priorites.
920  *
921  * Each able-to-send class gets to send until its byte allocation is
922  * exhausted.  Thus, the active pointer is only changed after a class has
923  * exhausted its allocation.
924  *
925  * If the scheduler finds no class that is underlimit or able to borrow,
926  * then the first class found that had a nonzero queue and is allowed to
927  * borrow gets to send.
928  */
929 
930 static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat * ifd,int op)931 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
932 {
933 	struct rm_class	*cl = NULL, *first = NULL;
934 	u_int		 deficit;
935 	int		 cpri;
936 	mbuf_t		*m;
937 	struct timeval	 now;
938 
939 	RM_GETTIME(now);
940 
941 	/*
942 	 * if the driver polls the top of the queue and then removes
943 	 * the polled packet, we must return the same packet.
944 	 */
945 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
946 		cl = ifd->pollcache_;
947 		cpri = cl->pri_;
948 		if (ifd->efficient_) {
949 			/* check if this class is overlimit */
950 			if (cl->undertime_.tv_sec != 0 &&
951 			    rmc_under_limit(cl, &now) == 0)
952 				first = cl;
953 		}
954 		ifd->pollcache_ = NULL;
955 		goto _wrr_out;
956 	}
957 	else {
958 		/* mode == ALTDQ_POLL || pollcache == NULL */
959 		ifd->pollcache_ = NULL;
960 		ifd->borrowed_[ifd->qi_] = NULL;
961 	}
962 #ifdef ADJUST_CUTOFF
963  _again:
964 #endif
965 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
966 		if (ifd->na_[cpri] == 0)
967 			continue;
968 		deficit = 0;
969 		/*
970 		 * Loop through twice for a priority level, if some class
971 		 * was unable to send a packet the first round because
972 		 * of the weighted round-robin mechanism.
973 		 * During the second loop at this level, deficit==2.
974 		 * (This second loop is not needed if for every class,
975 		 * "M[cl->pri_])" times "cl->allotment" is greater than
976 		 * the byte size for the largest packet in the class.)
977 		 */
978  _wrr_loop:
979 		cl = ifd->active_[cpri];
980 		ASSERT(cl != NULL);
981 		do {
982 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
983 				cl->bytes_alloc_ += cl->w_allotment_;
984 			if (!qempty(cl->q_)) {
985 				if ((cl->undertime_.tv_sec == 0) ||
986 				    rmc_under_limit(cl, &now)) {
987 					if (cl->bytes_alloc_ > 0 || deficit > 1)
988 						goto _wrr_out;
989 
990 					/* underlimit but no alloc */
991 					deficit = 1;
992 #if 1
993 					ifd->borrowed_[ifd->qi_] = NULL;
994 #endif
995 				}
996 				else if (first == NULL && cl->borrow_ != NULL)
997 					first = cl; /* borrowing candidate */
998 			}
999 
1000 			cl->bytes_alloc_ = 0;
1001 			cl = cl->peer_;
1002 		} while (cl != ifd->active_[cpri]);
1003 
1004 		if (deficit == 1) {
1005 			/* first loop found an underlimit class with deficit */
1006 			/* Loop on same priority level, with new deficit.  */
1007 			deficit = 2;
1008 			goto _wrr_loop;
1009 		}
1010 	}
1011 
1012 #ifdef ADJUST_CUTOFF
1013 	/*
1014 	 * no underlimit class found.  if cutoff is taking effect,
1015 	 * increase cutoff and try again.
1016 	 */
1017 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1018 		ifd->cutoff_++;
1019 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1020 		goto _again;
1021 	}
1022 #endif /* ADJUST_CUTOFF */
1023 	/*
1024 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1025 	 * class we encounter will send a packet if all the classes
1026 	 * of the link-sharing structure are overlimit.
1027 	 */
1028 	reset_cutoff(ifd);
1029 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1030 
1031 	if (!ifd->efficient_ || first == NULL)
1032 		return (NULL);
1033 
1034 	cl = first;
1035 	cpri = cl->pri_;
1036 #if 0	/* too time-consuming for nothing */
1037 	if (cl->sleeping_)
1038 		CALLOUT_STOP(&cl->callout_);
1039 	cl->sleeping_ = 0;
1040 	cl->undertime_.tv_sec = 0;
1041 #endif
1042 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1043 	ifd->cutoff_ = cl->borrow_->depth_;
1044 
1045 	/*
1046 	 * Deque the packet and do the book keeping...
1047 	 */
1048  _wrr_out:
1049 	if (op == ALTDQ_REMOVE) {
1050 		m = _rmc_getq(cl);
1051 		if (m == NULL)
1052 			panic("_rmc_wrr_dequeue_next");
1053 		if (qempty(cl->q_))
1054 			ifd->na_[cpri]--;
1055 
1056 		/*
1057 		 * Update class statistics and link data.
1058 		 */
1059 		if (cl->bytes_alloc_ > 0)
1060 			cl->bytes_alloc_ -= m_pktlen(m);
1061 
1062 		if ((cl->bytes_alloc_ <= 0) || first == cl)
1063 			ifd->active_[cl->pri_] = cl->peer_;
1064 		else
1065 			ifd->active_[cl->pri_] = cl;
1066 
1067 		ifd->class_[ifd->qi_] = cl;
1068 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1069 		ifd->now_[ifd->qi_] = now;
1070 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1071 		ifd->queued_++;
1072 	} else {
1073 		/* mode == ALTDQ_PPOLL */
1074 		m = _rmc_pollq(cl);
1075 		ifd->pollcache_ = cl;
1076 	}
1077 	return (m);
1078 }
1079 
1080 /*
1081  * Dequeue & return next packet from the highest priority class that
1082  * has a packet to send & has enough allocation to send it.  This
1083  * routine is called by a driver whenever it needs a new packet to
1084  * output.
1085  */
1086 static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat * ifd,int op)1087 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1088 {
1089 	mbuf_t		*m;
1090 	int		 cpri;
1091 	struct rm_class	*cl, *first = NULL;
1092 	struct timeval	 now;
1093 
1094 	RM_GETTIME(now);
1095 
1096 	/*
1097 	 * if the driver polls the top of the queue and then removes
1098 	 * the polled packet, we must return the same packet.
1099 	 */
1100 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1101 		cl = ifd->pollcache_;
1102 		cpri = cl->pri_;
1103 		ifd->pollcache_ = NULL;
1104 		goto _prr_out;
1105 	} else {
1106 		/* mode == ALTDQ_POLL || pollcache == NULL */
1107 		ifd->pollcache_ = NULL;
1108 		ifd->borrowed_[ifd->qi_] = NULL;
1109 	}
1110 #ifdef ADJUST_CUTOFF
1111  _again:
1112 #endif
1113 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1114 		if (ifd->na_[cpri] == 0)
1115 			continue;
1116 		cl = ifd->active_[cpri];
1117 		ASSERT(cl != NULL);
1118 		do {
1119 			if (!qempty(cl->q_)) {
1120 				if ((cl->undertime_.tv_sec == 0) ||
1121 				    rmc_under_limit(cl, &now))
1122 					goto _prr_out;
1123 				if (first == NULL && cl->borrow_ != NULL)
1124 					first = cl;
1125 			}
1126 			cl = cl->peer_;
1127 		} while (cl != ifd->active_[cpri]);
1128 	}
1129 
1130 #ifdef ADJUST_CUTOFF
1131 	/*
1132 	 * no underlimit class found.  if cutoff is taking effect, increase
1133 	 * cutoff and try again.
1134 	 */
1135 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1136 		ifd->cutoff_++;
1137 		goto _again;
1138 	}
1139 #endif /* ADJUST_CUTOFF */
1140 	/*
1141 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1142 	 * class we encounter will send a packet if all the classes
1143 	 * of the link-sharing structure are overlimit.
1144 	 */
1145 	reset_cutoff(ifd);
1146 	if (!ifd->efficient_ || first == NULL)
1147 		return (NULL);
1148 
1149 	cl = first;
1150 	cpri = cl->pri_;
1151 #if 0	/* too time-consuming for nothing */
1152 	if (cl->sleeping_)
1153 		CALLOUT_STOP(&cl->callout_);
1154 	cl->sleeping_ = 0;
1155 	cl->undertime_.tv_sec = 0;
1156 #endif
1157 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1158 	ifd->cutoff_ = cl->borrow_->depth_;
1159 
1160 	/*
1161 	 * Deque the packet and do the book keeping...
1162 	 */
1163  _prr_out:
1164 	if (op == ALTDQ_REMOVE) {
1165 		m = _rmc_getq(cl);
1166 		if (m == NULL)
1167 			panic("_rmc_prr_dequeue_next");
1168 		if (qempty(cl->q_))
1169 			ifd->na_[cpri]--;
1170 
1171 		ifd->active_[cpri] = cl->peer_;
1172 
1173 		ifd->class_[ifd->qi_] = cl;
1174 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1175 		ifd->now_[ifd->qi_] = now;
1176 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1177 		ifd->queued_++;
1178 	} else {
1179 		/* mode == ALTDQ_POLL */
1180 		m = _rmc_pollq(cl);
1181 		ifd->pollcache_ = cl;
1182 	}
1183 	return (m);
1184 }
1185 
1186 /*
1187  * mbuf_t *
1188  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
1189  *	is invoked by the packet driver to get the next packet to be
1190  *	dequeued and output on the link.  If WRR is enabled, then the
1191  *	WRR dequeue next routine will determine the next packet to sent.
1192  *	Otherwise, packet-by-packet round robin is invoked.
1193  *
1194  *	Returns:	NULL, if a packet is not available or if all
1195  *			classes are overlimit.
1196  *
1197  *			Otherwise, Pointer to the next packet.
1198  */
1199 
1200 mbuf_t *
rmc_dequeue_next(struct rm_ifdat * ifd,int mode)1201 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1202 {
1203 	if (ifd->queued_ >= ifd->maxqueued_)
1204 		return (NULL);
1205 	else if (ifd->wrr_)
1206 		return (_rmc_wrr_dequeue_next(ifd, mode));
1207 	else
1208 		return (_rmc_prr_dequeue_next(ifd, mode));
1209 }
1210 
1211 /*
1212  * Update the utilization estimate for the packet that just completed.
1213  * The packet's class & the parent(s) of that class all get their
1214  * estimators updated.  This routine is called by the driver's output-
1215  * packet-completion interrupt service routine.
1216  */
1217 
1218 /*
1219  * a macro to approximate "divide by 1000" that gives 0.000999,
1220  * if a value has enough effective digits.
1221  * (on pentium, mul takes 9 cycles but div takes 46!)
1222  */
1223 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1224 void
rmc_update_class_util(struct rm_ifdat * ifd)1225 rmc_update_class_util(struct rm_ifdat *ifd)
1226 {
1227 	int		 idle, avgidle, pktlen;
1228 	int		 pkt_time, tidle;
1229 	rm_class_t	*cl, *borrowed;
1230 	rm_class_t	*borrows;
1231 	struct timeval	*nowp;
1232 
1233 	/*
1234 	 * Get the most recent completed class.
1235 	 */
1236 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
1237 		return;
1238 
1239 	pktlen = ifd->curlen_[ifd->qo_];
1240 	borrowed = ifd->borrowed_[ifd->qo_];
1241 	borrows = borrowed;
1242 
1243 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1244 
1245 	/*
1246 	 * Run estimator on class and its ancestors.
1247 	 */
1248 	/*
1249 	 * rm_update_class_util is designed to be called when the
1250 	 * transfer is completed from a xmit complete interrupt,
1251 	 * but most drivers don't implement an upcall for that.
1252 	 * so, just use estimated completion time.
1253 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
1254 	 */
1255 	nowp = &ifd->now_[ifd->qo_];
1256 	/* get pkt_time (for link) in usec */
1257 #if 1  /* use approximation */
1258 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
1259 	pkt_time = NSEC_TO_USEC(pkt_time);
1260 #else
1261 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1262 #endif
1263 #if 1 /* ALTQ4PPP */
1264 	if (TV_LT(nowp, &ifd->ifnow_)) {
1265 		int iftime;
1266 
1267 		/*
1268 		 * make sure the estimated completion time does not go
1269 		 * too far.  it can happen when the link layer supports
1270 		 * data compression or the interface speed is set to
1271 		 * a much lower value.
1272 		 */
1273 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
1274 		if (iftime+pkt_time < ifd->maxiftime_) {
1275 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1276 		} else {
1277 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1278 		}
1279 	} else {
1280 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1281 	}
1282 #else
1283 	if (TV_LT(nowp, &ifd->ifnow_)) {
1284 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1285 	} else {
1286 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1287 	}
1288 #endif
1289 
1290 	while (cl != NULL) {
1291 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
1292 		if (idle >= 2000000)
1293 			/*
1294 			 * this class is idle enough, reset avgidle.
1295 			 * (TV_DELTA returns 2000000 us when delta is large.)
1296 			 */
1297 			cl->avgidle_ = cl->maxidle_;
1298 
1299 		/* get pkt_time (for class) in usec */
1300 #if 1  /* use approximation */
1301 		pkt_time = pktlen * cl->ns_per_byte_;
1302 		pkt_time = NSEC_TO_USEC(pkt_time);
1303 #else
1304 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1305 #endif
1306 		idle -= pkt_time;
1307 
1308 		avgidle = cl->avgidle_;
1309 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1310 		cl->avgidle_ = avgidle;
1311 
1312 		/* Are we overlimit ? */
1313 		if (avgidle <= 0) {
1314 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1315 #if 1 /* ALTQ */
1316 			/*
1317 			 * need some lower bound for avgidle, otherwise
1318 			 * a borrowing class gets unbounded penalty.
1319 			 */
1320 			if (avgidle < cl->minidle_)
1321 				avgidle = cl->avgidle_ = cl->minidle_;
1322 #endif
1323 			/* set next idle to make avgidle 0 */
1324 			tidle = pkt_time +
1325 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1326 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
1327 			++cl->stats_.over;
1328 		} else {
1329 			cl->avgidle_ =
1330 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1331 			cl->undertime_.tv_sec = 0;
1332 			if (cl->sleeping_) {
1333 				CALLOUT_STOP(&cl->callout_);
1334 				cl->sleeping_ = 0;
1335 			}
1336 		}
1337 
1338 		if (borrows != NULL) {
1339 			if (borrows != cl)
1340 				++cl->stats_.borrows;
1341 			else
1342 				borrows = NULL;
1343 		}
1344 		cl->last_ = ifd->ifnow_;
1345 		cl->last_pkttime_ = pkt_time;
1346 
1347 #if 1
1348 		if (cl->parent_ == NULL) {
1349 			/* take stats of root class */
1350 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1351 		}
1352 #endif
1353 
1354 		cl = cl->parent_;
1355 	}
1356 
1357 	/*
1358 	 * Check to see if cutoff needs to set to a new level.
1359 	 */
1360 	cl = ifd->class_[ifd->qo_];
1361 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1362 #if 1 /* ALTQ */
1363 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
1364 			rmc_tl_satisfied(ifd, nowp);
1365 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1366 		} else {
1367 			ifd->cutoff_ = borrowed->depth_;
1368 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1369 		}
1370 #else /* !ALTQ */
1371 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
1372 			reset_cutoff(ifd);
1373 #ifdef notdef
1374 			rmc_tl_satisfied(ifd, &now);
1375 #endif
1376 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1377 		} else {
1378 			ifd->cutoff_ = borrowed->depth_;
1379 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1380 		}
1381 #endif /* !ALTQ */
1382 	}
1383 
1384 	/*
1385 	 * Release class slot
1386 	 */
1387 	ifd->borrowed_[ifd->qo_] = NULL;
1388 	ifd->class_[ifd->qo_] = NULL;
1389 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1390 	ifd->queued_--;
1391 }
1392 
1393 /*
1394  * void
1395  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1396  *	over-limit action routines.  These get invoked by rmc_under_limit()
1397  *	if a class with packets to send if over its bandwidth limit & can't
1398  *	borrow from a parent class.
1399  *
1400  *	Returns: NONE
1401  */
1402 
1403 static void
rmc_drop_action(struct rm_class * cl)1404 rmc_drop_action(struct rm_class *cl)
1405 {
1406 	struct rm_ifdat	*ifd = cl->ifdat_;
1407 
1408 	ASSERT(qlen(cl->q_) > 0);
1409 	_rmc_dropq(cl);
1410 	if (qempty(cl->q_))
1411 		ifd->na_[cl->pri_]--;
1412 }
1413 
rmc_dropall(struct rm_class * cl)1414 void rmc_dropall(struct rm_class *cl)
1415 {
1416 	struct rm_ifdat	*ifd = cl->ifdat_;
1417 
1418 	if (!qempty(cl->q_)) {
1419 		_flushq(cl->q_);
1420 
1421 		ifd->na_[cl->pri_]--;
1422 	}
1423 }
1424 
1425 /*
1426  * void
1427  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1428  *	delay action routine.  It is invoked via rmc_under_limit when the
1429  *	packet is discovered to be overlimit.
1430  *
1431  *	If the delay action is result of borrow class being overlimit, then
1432  *	delay for the offtime of the borrowing class that is overlimit.
1433  *
1434  *	Returns: NONE
1435  */
1436 
1437 void
rmc_delay_action(struct rm_class * cl,struct rm_class * borrow)1438 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1439 {
1440 	int	delay, t, extradelay;
1441 
1442 	cl->stats_.overactions++;
1443 	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
1444 #ifndef BORROW_OFFTIME
1445 	delay += cl->offtime_;
1446 #endif
1447 
1448 	if (!cl->sleeping_) {
1449 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1450 #ifdef BORROW_OFFTIME
1451 		if (borrow != NULL)
1452 			extradelay = borrow->offtime_;
1453 		else
1454 #endif
1455 			extradelay = cl->offtime_;
1456 
1457 #ifdef ALTQ
1458 		/*
1459 		 * XXX recalculate suspend time:
1460 		 * current undertime is (tidle + pkt_time) calculated
1461 		 * from the last transmission.
1462 		 *	tidle: time required to bring avgidle back to 0
1463 		 *	pkt_time: target waiting time for this class
1464 		 * we need to replace pkt_time by offtime
1465 		 */
1466 		extradelay -= cl->last_pkttime_;
1467 #endif
1468 		if (extradelay > 0) {
1469 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1470 			delay += extradelay;
1471 		}
1472 
1473 		cl->sleeping_ = 1;
1474 		cl->stats_.delays++;
1475 
1476 		/*
1477 		 * Since packets are phased randomly with respect to the
1478 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
1479 		 * short time so we have to wait for at least two ticks.
1480 		 * NOTE:  If there's no other traffic, we need the timer as
1481 		 * a 'backstop' to restart this class.
1482 		 */
1483 		if (delay > tick * 2) {
1484 #ifdef __FreeBSD__
1485 			/* FreeBSD rounds up the tick */
1486 			t = hzto(&cl->undertime_);
1487 #else
1488 			/* other BSDs round down the tick */
1489 			t = hzto(&cl->undertime_) + 1;
1490 #endif
1491 		} else
1492 			t = 2;
1493 		CALLOUT_RESET(&cl->callout_, t,
1494 			      (timeout_t *)rmc_restart, (caddr_t)cl);
1495 	}
1496 }
1497 
1498 /*
1499  * void
1500  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
1501  *	called by the system timer code & is responsible checking if the
1502  *	class is still sleeping (it might have been restarted as a side
1503  *	effect of the queue scan on a packet arrival) and, if so, restarting
1504  *	output for the class.  Inspecting the class state & restarting output
1505  *	require locking the class structure.  In general the driver is
1506  *	responsible for locking but this is the only routine that is not
1507  *	called directly or indirectly from the interface driver so it has
1508  *	know about system locking conventions.  Under bsd, locking is done
1509  *	by raising IPL to splimp so that's what's implemented here.  On a
1510  *	different system this would probably need to be changed.
1511  *
1512  *	Returns:	NONE
1513  */
1514 
1515 static void
rmc_restart(struct rm_class * cl)1516 rmc_restart(struct rm_class *cl)
1517 {
1518 	struct rm_ifdat	*ifd = cl->ifdat_;
1519 	int		 s;
1520 
1521 	s = splimp();
1522 	if (cl->sleeping_) {
1523 		cl->sleeping_ = 0;
1524 		cl->undertime_.tv_sec = 0;
1525 
1526 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1527 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1528 			(ifd->restart)(ifd->ifq_);
1529 		}
1530 	}
1531 	splx(s);
1532 }
1533 
1534 /*
1535  * void
1536  * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
1537  *	handling routine for the root class of the link sharing structure.
1538  *
1539  *	Returns: NONE
1540  */
1541 
1542 static void
rmc_root_overlimit(struct rm_class * cl,struct rm_class * borrow)1543 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
1544 {
1545     panic("rmc_root_overlimit");
1546 }
1547 
1548 /*
1549  * Packet Queue handling routines.  Eventually, this is to localize the
1550  *	effects on the code whether queues are red queues or droptail
1551  *	queues.
1552  */
1553 
1554 static int
_rmc_addq(rm_class_t * cl,mbuf_t * m)1555 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1556 {
1557 #ifdef ALTQ_RIO
1558 	if (q_is_rio(cl->q_))
1559 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1560 #endif
1561 #ifdef ALTQ_RED
1562 	if (q_is_red(cl->q_))
1563 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1564 #endif /* ALTQ_RED */
1565 
1566 	if (cl->flags_ & RMCF_CLEARDSCP)
1567 		write_dsfield(m, cl->pktattr_, 0);
1568 
1569 	_addq(cl->q_, m);
1570 	return (0);
1571 }
1572 
1573 /* note: _rmc_dropq is not called for red */
1574 static void
_rmc_dropq(rm_class_t * cl)1575 _rmc_dropq(rm_class_t *cl)
1576 {
1577 	mbuf_t	*m;
1578 
1579 	if ((m = _getq(cl->q_)) != NULL)
1580 		m_freem(m);
1581 }
1582 
1583 static mbuf_t *
_rmc_getq(rm_class_t * cl)1584 _rmc_getq(rm_class_t *cl)
1585 {
1586 #ifdef ALTQ_RIO
1587 	if (q_is_rio(cl->q_))
1588 		return rio_getq((rio_t *)cl->red_, cl->q_);
1589 #endif
1590 #ifdef ALTQ_RED
1591 	if (q_is_red(cl->q_))
1592 		return red_getq(cl->red_, cl->q_);
1593 #endif
1594 	return _getq(cl->q_);
1595 }
1596 
1597 static mbuf_t *
_rmc_pollq(rm_class_t * cl)1598 _rmc_pollq(rm_class_t *cl)
1599 {
1600 	return qhead(cl->q_);
1601 }
1602 
1603 #ifdef CBQ_TRACE
1604 
1605 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
1606 struct cbqtrace		*cbqtrace_ptr = NULL;
1607 int			 cbqtrace_count;
1608 
1609 /*
1610  * DDB hook to trace cbq events:
1611  *  the last 1024 events are held in a circular buffer.
1612  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
1613  */
1614 void cbqtrace_dump(int);
1615 static char *rmc_funcname(void *);
1616 
1617 static struct rmc_funcs {
1618 	void	*func;
1619 	char	*name;
1620 } rmc_funcs[] =
1621 {
1622 	rmc_init,		"rmc_init",
1623 	rmc_queue_packet,	"rmc_queue_packet",
1624 	rmc_under_limit,	"rmc_under_limit",
1625 	rmc_update_class_util,	"rmc_update_class_util",
1626 	rmc_delay_action,	"rmc_delay_action",
1627 	rmc_restart,		"rmc_restart",
1628 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
1629 	NULL,			NULL
1630 };
1631 
rmc_funcname(void * func)1632 static char *rmc_funcname(void *func)
1633 {
1634 	struct rmc_funcs *fp;
1635 
1636 	for (fp = rmc_funcs; fp->func != NULL; fp++)
1637 		if (fp->func == func)
1638 			return (fp->name);
1639 	return ("unknown");
1640 }
1641 
cbqtrace_dump(int counter)1642 void cbqtrace_dump(int counter)
1643 {
1644 	int	 i, *p;
1645 	char	*cp;
1646 
1647 	counter = counter % NCBQTRACE;
1648 	p = (int *)&cbqtrace_buffer[counter];
1649 
1650 	for (i=0; i<20; i++) {
1651 		printf("[0x%x] ", *p++);
1652 		printf("%s: ", rmc_funcname((void *)*p++));
1653 		cp = (char *)p++;
1654 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1655 		printf("%d\n",*p++);
1656 
1657 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1658 			p = (int *)cbqtrace_buffer;
1659 	}
1660 }
1661 #endif /* CBQ_TRACE */
1662 
1663 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
1664 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1665 
1666 void
_addq(class_queue_t * q,mbuf_t * m)1667 _addq(class_queue_t *q, mbuf_t *m)
1668 {
1669         mbuf_t	*m0;
1670 
1671 	if ((m0 = qtail(q)) != NULL)
1672 		m->m_nextpkt = m0->m_nextpkt;
1673 	else
1674 		m0 = m;
1675 	m0->m_nextpkt = m;
1676 	qtail(q) = m;
1677 	qlen(q)++;
1678 }
1679 
1680 mbuf_t *
_getq(class_queue_t * q)1681 _getq(class_queue_t *q)
1682 {
1683 	mbuf_t	*m, *m0;
1684 
1685 	if ((m = qtail(q)) == NULL)
1686 		return (NULL);
1687 	if ((m0 = m->m_nextpkt) != m)
1688 		m->m_nextpkt = m0->m_nextpkt;
1689 	else {
1690 		ASSERT(qlen(q) == 1);
1691 		qtail(q) = NULL;
1692 	}
1693 	qlen(q)--;
1694 	m0->m_nextpkt = NULL;
1695 	return (m0);
1696 }
1697 
1698 /* drop a packet at the tail of the queue */
1699 mbuf_t *
_getq_tail(class_queue_t * q)1700 _getq_tail(class_queue_t *q)
1701 {
1702 	mbuf_t	*m, *m0, *prev;
1703 
1704 	if ((m = m0 = qtail(q)) == NULL)
1705 		return NULL;
1706 	do {
1707 		prev = m0;
1708 		m0 = m0->m_nextpkt;
1709 	} while (m0 != m);
1710 	prev->m_nextpkt = m->m_nextpkt;
1711 	if (prev == m)  {
1712 		ASSERT(qlen(q) == 1);
1713 		qtail(q) = NULL;
1714 	} else
1715 		qtail(q) = prev;
1716 	qlen(q)--;
1717 	m->m_nextpkt = NULL;
1718 	return (m);
1719 }
1720 
1721 /* randomly select a packet in the queue */
1722 mbuf_t *
_getq_random(class_queue_t * q)1723 _getq_random(class_queue_t *q)
1724 {
1725 	struct mbuf	*m;
1726 	int		 i, n;
1727 
1728 	if ((m = qtail(q)) == NULL)
1729 		return NULL;
1730 	if (m->m_nextpkt == m) {
1731 		ASSERT(qlen(q) == 1);
1732 		qtail(q) = NULL;
1733 	} else {
1734 		struct mbuf *prev = NULL;
1735 
1736 		n = arc4random_uniform(qlen(q)) + 1;
1737 		for (i = 0; i < n; i++) {
1738 			prev = m;
1739 			m = m->m_nextpkt;
1740 		}
1741 		prev->m_nextpkt = m->m_nextpkt;
1742 		if (m == qtail(q))
1743 			qtail(q) = prev;
1744 	}
1745 	qlen(q)--;
1746 	m->m_nextpkt = NULL;
1747 	return (m);
1748 }
1749 
1750 void
_removeq(class_queue_t * q,mbuf_t * m)1751 _removeq(class_queue_t *q, mbuf_t *m)
1752 {
1753 	mbuf_t	*m0, *prev;
1754 
1755 	m0 = qtail(q);
1756 	do {
1757 		prev = m0;
1758 		m0 = m0->m_nextpkt;
1759 	} while (m0 != m);
1760 	prev->m_nextpkt = m->m_nextpkt;
1761 	if (prev == m)
1762 		qtail(q) = NULL;
1763 	else if (qtail(q) == m)
1764 		qtail(q) = prev;
1765 	qlen(q)--;
1766 }
1767 
1768 void
_flushq(class_queue_t * q)1769 _flushq(class_queue_t *q)
1770 {
1771 	mbuf_t *m;
1772 
1773 	while ((m = _getq(q)) != NULL)
1774 		m_freem(m);
1775 	ASSERT(qlen(q) == 0);
1776 }
1777 
1778 #endif /* !__GNUC__ || ALTQ_DEBUG */
1779 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1780