1 /*        $NetBSD: altq_rmclass.c,v 1.32 2025/01/08 13:00:04 joe Exp $          */
2 /*        $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $  */
3 
4 /*
5  * Copyright (c) 1991-1997 Regents of the University of California.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the Network Research
19  *      Group at Lawrence Berkeley Laboratory.
20  * 4. Neither the name of the University nor of the Laboratory may be used
21  *    to endorse or promote products derived from this software without
22  *    specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * LBL code modified by speer@eng.sun.com, May 1977.
37  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.32 2025/01/08 13:00:04 joe Exp $");
42 
43 /* #ident "@(#)rm_class.c  1.48     97/12/05 SMI" */
44 
45 #ifdef _KERNEL_OPT
46 #include "opt_altq.h"
47 #include "opt_inet.h"
48 #endif
49 
50 #ifdef ALTQ_CBQ     /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
51 
52 #include <sys/param.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/socket.h>
56 #include <sys/systm.h>
57 #include <sys/errno.h>
58 #include <sys/time.h>
59 #ifdef ALTQ3_COMPAT
60 #include <sys/kernel.h>
61 #endif
62 #include <sys/cprng.h>
63 
64 #include <net/if.h>
65 #include <net/if_types.h>
66 #ifdef ALTQ3_COMPAT
67 #include <netinet/in.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/ip.h>
70 #endif
71 
72 #include <altq/altq.h>
73 #include <altq/altq_rmclass.h>
74 #include <altq/altq_rmclass_debug.h>
75 #include <altq/altq_red.h>
76 #include <altq/altq_rio.h>
77 
78 /*
79  * Local Macros
80  */
81 
82 #define   reset_cutoff(ifd)   { ifd->cutoff_ = RM_MAXDEPTH; }
83 
84 #define   PSEC_TO_NSEC(t)     ((t) / 1000)
85 
86 /*
87  * Local routines.
88  */
89 
90 static int          rmc_satisfied(struct rm_class *, struct timespec *);
91 static void         rmc_wrr_set_weights(struct rm_ifdat *);
92 static void         rmc_depth_compute(struct rm_class *);
93 static void         rmc_depth_recompute(rm_class_t *);
94 
95 static mbuf_t       *_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
96 static mbuf_t       *_rmc_prr_dequeue_next(struct rm_ifdat *, int);
97 
98 static int          _rmc_addq(rm_class_t *, mbuf_t *);
99 static void         _rmc_dropq(rm_class_t *);
100 static mbuf_t       *_rmc_getq(rm_class_t *);
101 static mbuf_t       *_rmc_pollq(rm_class_t *);
102 
103 static int          rmc_under_limit(struct rm_class *, struct timespec *);
104 static void         rmc_tl_satisfied(struct rm_ifdat *, struct timespec *);
105 static void         rmc_drop_action(struct rm_class *);
106 static void         rmc_restart(struct rm_class *);
107 static void         rmc_root_overlimit(struct rm_class *, struct rm_class *);
108 
109 #define   BORROW_OFFTIME
110 /*
111  * BORROW_OFFTIME (experimental):
112  * borrow the offtime of the class borrowing from.
113  * the reason is that when its own offtime is set, the class is unable
114  * to borrow much, especially when cutoff is taking effect.
115  * but when the borrowed class is overloaded (advidle is close to minidle),
116  * use the borrowing class's offtime to avoid overload.
117  */
118 #define   ADJUST_CUTOFF
119 /*
120  * ADJUST_CUTOFF (experimental):
121  * if no underlimit class is found due to cutoff, increase cutoff and
122  * retry the scheduling loop.
123  * also, don't invoke delay_actions while cutoff is taking effect,
124  * since a sleeping class won't have a chance to be scheduled in the
125  * next loop.
126  *
127  * now heuristics for setting the top-level variable (cutoff_) becomes:
128  *        1. if a packet arrives for a not-overlimit class, set cutoff
129  *           to the depth of the class.
130  *        2. if cutoff is i, and a packet arrives for an overlimit class
131  *           with an underlimit ancestor at a lower level than i (say j),
132  *           then set cutoff to j.
133  *        3. at scheduling a packet, if there is no underlimit class
134  *           due to the current cutoff level, increase cutoff by 1 and
135  *           then try to schedule again.
136  */
137 
138 /*
139  * rm_class_t *
140  * rmc_newclass(...) - Create a new resource management class at priority
141  * 'pri' on the interface given by 'ifd'.
142  *
143  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
144  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
145  *              than 100% of the bandwidth, this number should be the
146  *              'effective' rate for the class.  Let f be the
147  *              bandwidth fraction allocated to this class, and let
148  *              nsPerByte be the data rate of the output link in
149  *              nanoseconds/byte.  Then nsecPerByte is set to
150  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
151  *              for a class that gets 50% of an ethernet's bandwidth.
152  *
153  * action       the routine to call when the class is over limit.
154  *
155  * maxq         max allowable queue size for class (in packets).
156  *
157  * parent       parent class pointer.
158  *
159  * borrow       class to borrow from (should be either 'parent' or null).
160  *
161  * maxidle      max value allowed for class 'idle' time estimate (this
162  *              parameter determines how large an initial burst of packets
163  *              can be before overlimit action is invoked.
164  *
165  * offtime      how long 'delay' action will delay when class goes over
166  *              limit (this parameter determines the steady-state burst
167  *              size when a class is running over its limit).
168  *
169  * Maxidle and offtime have to be computed from the following:  If the
170  * average packet size is s, the bandwidth fraction allocated to this
171  * class is f, we want to allow b packet bursts, and the gain of the
172  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
173  *
174  *   ptime = s * nsPerByte * (1 - f) / f
175  *   maxidle = ptime * (1 - g^b) / g^b
176  *   minidle = -ptime * (1 / (f - 1))
177  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
178  *
179  * Operationally, it's convenient to specify maxidle & offtime in units
180  * independent of the link bandwidth so the maxidle & offtime passed to
181  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
182  * (The constant factor is a scale factor needed to make the parameters
183  * integers.  This scaling also means that the 'unscaled' values of
184  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
185  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
186  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
187  * maxidle also must be scaled upward by this value.  Thus, the passed
188  * values for maxidle and offtime can be computed as follows:
189  *
190  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
191  * offtime = offtime * 8 / (1000 * nsecPerByte)
192  *
193  * When USE_HRTIME is employed, then maxidle and offtime become:
194  *        maxidle = maxilde * (8.0 / nsecPerByte);
195  *        offtime = offtime * (8.0 / nsecPerByte);
196  */
197 struct rm_class *
rmc_newclass(int pri,struct rm_ifdat * ifd,uint64_t psecPerByte,void (* action)(rm_class_t *,rm_class_t *),int maxq,struct rm_class * parent,struct rm_class * borrow,u_int maxidle,int minidle,u_int offtime,int pktsize,int flags)198 rmc_newclass(int pri, struct rm_ifdat *ifd, uint64_t psecPerByte,
199     void (*action)(rm_class_t *, rm_class_t *), int maxq,
200     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
201     int minidle, u_int offtime, int pktsize, int flags)
202 {
203           struct rm_class     *cl;
204           struct rm_class     *peer;
205           int                  s;
206 
207           if (pri >= RM_MAXPRIO)
208                     return NULL;
209 #ifndef ALTQ_RED
210           if (flags & RMCF_RED) {
211 #ifdef ALTQ_DEBUG
212                     printf("rmc_newclass: RED not configured for CBQ!\n");
213 #endif
214                     return NULL;
215           }
216 #endif
217 #ifndef ALTQ_RIO
218           if (flags & RMCF_RIO) {
219 #ifdef ALTQ_DEBUG
220                     printf("rmc_newclass: RIO not configured for CBQ!\n");
221 #endif
222                     return NULL;
223           }
224 #endif
225 
226           cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
227           if (cl == NULL)
228                     return NULL;
229           CALLOUT_INIT(&cl->callout_);
230 
231           cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
232           if (cl->q_ == NULL) {
233                     free(cl, M_DEVBUF);
234                     return NULL;
235           }
236 
237           /*
238            * Class initialization.
239            */
240           cl->children_ = NULL;
241           cl->parent_ = parent;
242           cl->borrow_ = borrow;
243           cl->leaf_ = 1;
244           cl->ifdat_ = ifd;
245           cl->pri_ = pri;
246           cl->allotment_ = (u_int)(RM_PS_PER_SEC / psecPerByte); /* Bytes per sec */
247           cl->depth_ = 0;
248           cl->qthresh_ = 0;
249           cl->ps_per_byte_ = psecPerByte;
250 
251           qlimit(cl->q_) = maxq;
252           qtype(cl->q_) = Q_DROPHEAD;
253           qlen(cl->q_) = 0;
254           cl->flags_ = flags;
255 
256 #if 1 /* minidle is also scaled in ALTQ */
257           cl->minidle_ = ((int64_t)minidle * (int64_t)psecPerByte) / 8;
258           if (cl->minidle_ > 0)
259                     cl->minidle_ = 0;
260 #else
261           cl->minidle_ = minidle;
262 #endif
263           cl->maxidle_ = ((int64_t)maxidle * (int64_t)psecPerByte) / 8;
264           if (cl->maxidle_ == 0)
265                     cl->maxidle_ = 1;
266 #if 1 /* offtime is also scaled in ALTQ */
267           cl->avgidle_ = cl->maxidle_;
268           cl->offtime_ = (((int64_t)offtime * (int64_t)psecPerByte) / 8) >> RM_FILTER_GAIN;
269           if (cl->offtime_ == 0)
270                     cl->offtime_ = 1;
271 #else
272           cl->avgidle_ = 0;
273           cl->offtime_ = (offtime * nsecPerByte) / 8;
274 #endif
275           cl->overlimit = action;
276 
277 #ifdef ALTQ_RED
278           if (flags & (RMCF_RED|RMCF_RIO)) {
279                     int red_flags, red_pkttime;
280 
281                     red_flags = 0;
282                     if (flags & RMCF_ECN)
283                               red_flags |= REDF_ECN;
284                     if (flags & RMCF_FLOWVALVE)
285                               red_flags |= REDF_FLOWVALVE;
286 #ifdef ALTQ_RIO
287                     if (flags & RMCF_CLEARDSCP)
288                               red_flags |= RIOF_CLEARDSCP;
289 #endif
290                     red_pkttime = PSEC_TO_NSEC(psecPerByte) * pktsize  / 1000;
291 
292                     if (flags & RMCF_RED) {
293                               cl->red_ = red_alloc(0, 0,
294                                   qlimit(cl->q_) * 10/100,
295                                   qlimit(cl->q_) * 30/100,
296                                   red_flags, red_pkttime);
297                               if (cl->red_ != NULL)
298                                         qtype(cl->q_) = Q_RED;
299                     }
300 #ifdef ALTQ_RIO
301                     else {
302                               cl->red_ = (red_t *)rio_alloc(0, NULL,
303                                                                   red_flags, red_pkttime);
304                               if (cl->red_ != NULL)
305                                         qtype(cl->q_) = Q_RIO;
306                     }
307 #endif
308           }
309 #endif /* ALTQ_RED */
310 
311           /*
312            * put the class into the class tree
313            */
314           s = splnet();
315           if ((peer = ifd->active_[pri]) != NULL) {
316                     /* find the last class at this pri */
317                     cl->peer_ = peer;
318                     while (peer->peer_ != ifd->active_[pri])
319                               peer = peer->peer_;
320                     peer->peer_ = cl;
321           } else {
322                     ifd->active_[pri] = cl;
323                     cl->peer_ = cl;
324           }
325 
326           if (cl->parent_) {
327                     cl->next_ = parent->children_;
328                     parent->children_ = cl;
329                     parent->leaf_ = 0;
330           }
331 
332           /*
333            * Compute the depth of this class and its ancestors in the class
334            * hierarchy.
335            */
336           rmc_depth_compute(cl);
337 
338           /*
339            * If CBQ's WRR is enabled, then initialize the class WRR state.
340            */
341           if (ifd->wrr_) {
342                     ifd->num_[pri]++;
343                     ifd->alloc_[pri] += cl->allotment_;
344                     rmc_wrr_set_weights(ifd);
345           }
346           splx(s);
347           return cl;
348 }
349 
350 int
rmc_modclass(struct rm_class * cl,uint64_t psecPerByte,int maxq,u_int maxidle,int minidle,u_int offtime,int pktsize)351 rmc_modclass(struct rm_class *cl, uint64_t psecPerByte, int maxq, u_int maxidle,
352     int minidle, u_int offtime, int pktsize)
353 {
354           struct rm_ifdat     *ifd;
355           u_int                old_allotment;
356           int                  s;
357 
358           ifd = cl->ifdat_;
359           old_allotment = cl->allotment_;
360 
361           s = splnet();
362           cl->allotment_ = (u_int)(RM_PS_PER_SEC / psecPerByte); /* Bytes per sec */
363           cl->qthresh_ = 0;
364           cl->ps_per_byte_ = psecPerByte;
365 
366           qlimit(cl->q_) = maxq;
367 
368 #if 1 /* minidle is also scaled in ALTQ */
369           cl->minidle_ = ((int64_t)minidle * (int64_t)psecPerByte) / 8;
370           if (cl->minidle_ > 0)
371                     cl->minidle_ = 0;
372 #else
373           cl->minidle_ = minidle;
374 #endif
375           cl->maxidle_ = ((int64_t)maxidle * (int64_t)psecPerByte) / 8;
376           if (cl->maxidle_ == 0)
377                     cl->maxidle_ = 1;
378 #if 1 /* offtime is also scaled in ALTQ */
379           cl->avgidle_ = cl->maxidle_;
380           cl->offtime_ = (((int64_t)offtime * (int64_t)psecPerByte) / 8) >> RM_FILTER_GAIN;
381           if (cl->offtime_ == 0)
382                     cl->offtime_ = 1;
383 #else
384           cl->avgidle_ = 0;
385           cl->offtime_ = (offtime * nsecPerByte) / 8;
386 #endif
387 
388           /*
389            * If CBQ's WRR is enabled, then initialize the class WRR state.
390            */
391           if (ifd->wrr_) {
392                     ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
393                     rmc_wrr_set_weights(ifd);
394           }
395           splx(s);
396           return 0;
397 }
398 
399 /*
400  * static void
401  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
402  *        the appropriate run robin weights for the CBQ weighted round robin
403  *        algorithm.
404  *
405  *        Returns: NONE
406  */
407 
408 static void
rmc_wrr_set_weights(struct rm_ifdat * ifd)409 rmc_wrr_set_weights(struct rm_ifdat *ifd)
410 {
411           int                 i;
412           struct rm_class     *cl, *clh;
413 
414           for (i = 0; i < RM_MAXPRIO; i++) {
415                     /*
416                      * This is inverted from that of the simulator to
417                      * maintain precision.
418                      */
419                     if (ifd->num_[i] == 0)
420                               ifd->M_[i] = 0;
421                     else
422                               ifd->M_[i] = ifd->alloc_[i] /
423                                         (ifd->num_[i] * ifd->maxpkt_);
424                     /*
425                      * Compute the weighted allotment for each class.
426                      * This takes the expensive div instruction out
427                      * of the main loop for the wrr scheduling path.
428                      * These only get recomputed when a class comes or
429                      * goes.
430                      */
431                     if (ifd->active_[i] != NULL) {
432                               clh = cl = ifd->active_[i];
433                               do {
434                                         /* safe-guard for slow link or alloc_ == 0 */
435                                         if (ifd->M_[i] == 0)
436                                                   cl->w_allotment_ = 0;
437                                         else
438                                                   cl->w_allotment_ = cl->allotment_ /
439                                                             ifd->M_[i];
440                                         cl = cl->peer_;
441                               } while ((cl != NULL) && (cl != clh));
442                     }
443           }
444 }
445 
446 int
rmc_get_weight(struct rm_ifdat * ifd,int pri)447 rmc_get_weight(struct rm_ifdat *ifd, int pri)
448 {
449           if ((pri >= 0) && (pri < RM_MAXPRIO))
450                     return (ifd->M_[pri]);
451           else
452                     return 0;
453 }
454 
455 /*
456  * static void
457  * rmc_depth_compute(struct rm_class *cl) - This function computes the
458  *        appropriate depth of class 'cl' and its ancestors.
459  *
460  *        Returns:  NONE
461  */
462 
463 static void
rmc_depth_compute(struct rm_class * cl)464 rmc_depth_compute(struct rm_class *cl)
465 {
466           rm_class_t          *t = cl, *p;
467 
468           /*
469            * Recompute the depth for the branch of the tree.
470            */
471           while (t != NULL) {
472                     p = t->parent_;
473                     if (p && (t->depth_ >= p->depth_)) {
474                               p->depth_ = t->depth_ + 1;
475                               t = p;
476                     } else
477                               t = NULL;
478           }
479 }
480 
481 /*
482  * static void
483  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
484  *        the depth of the tree after a class has been deleted.
485  *
486  *        Returns:  NONE
487  */
488 
489 static void
rmc_depth_recompute(rm_class_t * cl)490 rmc_depth_recompute(rm_class_t *cl)
491 {
492 #if 1 /* ALTQ */
493           rm_class_t          *p, *t;
494 
495           p = cl;
496           while (p != NULL) {
497                     if ((t = p->children_) == NULL) {
498                               p->depth_ = 0;
499                     } else {
500                               int cdepth = 0;
501 
502                               while (t != NULL) {
503                                         if (t->depth_ > cdepth)
504                                                   cdepth = t->depth_;
505                                         t = t->next_;
506                               }
507 
508                               if (p->depth_ == cdepth + 1)
509                                         /* no change to this parent */
510                                         return;
511 
512                               p->depth_ = cdepth + 1;
513                     }
514 
515                     p = p->parent_;
516           }
517 #else
518           rm_class_t          *t;
519 
520           if (cl->depth_ >= 1) {
521                     if (cl->children_ == NULL) {
522                               cl->depth_ = 0;
523                     } else if ((t = cl->children_) != NULL) {
524                               while (t != NULL) {
525                                         if (t->children_ != NULL)
526                                                   rmc_depth_recompute(t);
527                                         t = t->next_;
528                               }
529                     } else
530                               rmc_depth_compute(cl);
531           }
532 #endif
533 }
534 
535 /*
536  * void
537  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
538  *        function deletes a class from the link-sharing structure and frees
539  *        all resources associated with the class.
540  *
541  *        Returns: NONE
542  */
543 
544 void
rmc_delete_class(struct rm_ifdat * ifd,struct rm_class * cl)545 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
546 {
547           struct rm_class     *p, *head, *previous;
548           int                  s;
549 
550           ASSERT(cl->children_ == NULL);
551 
552           if (cl->sleeping_)
553                     CALLOUT_STOP(&cl->callout_);
554 
555           s = splnet();
556           /*
557            * Free packets in the packet queue.
558            * XXX - this may not be a desired behavior.  Packets should be
559            *                  re-queued.
560            */
561           rmc_dropall(cl);
562 
563           /*
564            * If the class has a parent, then remove the class from the
565            * class from the parent's children chain.
566            */
567           if (cl->parent_ != NULL) {
568                     head = cl->parent_->children_;
569                     p = previous = head;
570                     if (head->next_ == NULL) {
571                               ASSERT(head == cl);
572                               cl->parent_->children_ = NULL;
573                               cl->parent_->leaf_ = 1;
574                     } else while (p != NULL) {
575                               if (p == cl) {
576                                         if (cl == head)
577                                                   cl->parent_->children_ = cl->next_;
578                                         else
579                                                   previous->next_ = cl->next_;
580                                         cl->next_ = NULL;
581                                         p = NULL;
582                               } else {
583                                         previous = p;
584                                         p = p->next_;
585                               }
586                     }
587           }
588 
589           /*
590            * Delete class from class priority peer list.
591            */
592           if ((p = ifd->active_[cl->pri_]) != NULL) {
593                     /*
594                      * If there is more than one member of this priority
595                      * level, then look for class(cl) in the priority level.
596                      */
597                     if (p != p->peer_) {
598                               while (p->peer_ != cl)
599                                         p = p->peer_;
600                               p->peer_ = cl->peer_;
601 
602                               if (ifd->active_[cl->pri_] == cl)
603                                         ifd->active_[cl->pri_] = cl->peer_;
604                     } else {
605                               ASSERT(p == cl);
606                               ifd->active_[cl->pri_] = NULL;
607                     }
608           }
609 
610           /*
611            * Recompute the WRR weights.
612            */
613           if (ifd->wrr_) {
614                     ifd->alloc_[cl->pri_] -= cl->allotment_;
615                     ifd->num_[cl->pri_]--;
616                     rmc_wrr_set_weights(ifd);
617           }
618 
619           /*
620            * Re-compute the depth of the tree.
621            */
622 #if 1 /* ALTQ */
623           rmc_depth_recompute(cl->parent_);
624 #else
625           rmc_depth_recompute(ifd->root_);
626 #endif
627 
628           splx(s);
629 
630           /*
631            * Free the class structure.
632            */
633           if (cl->red_ != NULL) {
634 #ifdef ALTQ_RIO
635                     if (q_is_rio(cl->q_))
636                               rio_destroy((rio_t *)cl->red_);
637 #endif
638 #ifdef ALTQ_RED
639                     if (q_is_red(cl->q_))
640                               red_destroy(cl->red_);
641 #endif
642           }
643           free(cl->q_, M_DEVBUF);
644           free(cl, M_DEVBUF);
645 }
646 
647 
648 /*
649  * int
650  * rmc_init(...) - Initialize the resource management data structures
651  *        associated with the output portion of interface 'ifp'.  'ifd' is
652  *        where the structures will be built (for backwards compatibility, the
653  *        structures aren't kept in the ifnet struct).  'nsecPerByte'
654  *        gives the link speed (inverse of bandwidth) in nanoseconds/byte.
655  *        'restart' is the driver-specific routine that the generic 'delay
656  *        until under limit' action will call to restart output.  `maxq'
657  *        is the queue size of the 'link' & 'default' classes.  'maxqueued'
658  *        is the maximum number of packets that the resource management
659  *        code will allow to be queued 'downstream' (this is typically 1).
660  *
661  *        Returns:  0 on success
662  */
663 
664 int
rmc_init(struct ifaltq * ifq,struct rm_ifdat * ifd,uint64_t psecPerByte,void (* restart)(struct ifaltq *),int maxq,int maxqueued,u_int maxidle,int minidle,u_int offtime,int flags)665 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, uint64_t psecPerByte,
666     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
667     int minidle, u_int offtime, int flags)
668 {
669           int i, mtu;
670 
671           /*
672            * Initialize the CBQ tracing/debug facility.
673            */
674           CBQTRACEINIT();
675 
676           mtu = ifq->altq_ifp->if_mtu;
677           if (mtu < 1) {
678                     printf("altq: %s: invalid MTU (interface not initialized?)\n",
679                         ifq->altq_ifp->if_xname);
680                     return EINVAL;
681           }
682 
683           (void)memset((char *)ifd, 0, sizeof (*ifd));
684           ifd->ifq_ = ifq;
685           ifd->restart = restart;
686           ifd->maxqueued_ = maxqueued;
687           ifd->ps_per_byte_ = psecPerByte;
688           ifd->maxpkt_ = mtu;
689           ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
690           ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
691 #if 1
692           ifd->maxiftime_ = mtu * psecPerByte / 1000 / 1000 * 16;
693           if ((int64_t)mtu * psecPerByte > (int64_t)10 * 1000000000)
694                     ifd->maxiftime_ /= 4;
695 #endif
696 
697           reset_cutoff(ifd);
698           CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
699 
700           /*
701            * Initialize the CBQ's WRR state.
702            */
703           for (i = 0; i < RM_MAXPRIO; i++) {
704                     ifd->alloc_[i] = 0;
705                     ifd->M_[i] = 0;
706                     ifd->num_[i] = 0;
707                     ifd->na_[i] = 0;
708                     ifd->active_[i] = NULL;
709           }
710 
711           /*
712            * Initialize current packet state.
713            */
714           ifd->qi_ = 0;
715           ifd->qo_ = 0;
716           for (i = 0; i < RM_MAXQUEUED; i++) {
717                     ifd->class_[i] = NULL;
718                     ifd->curlen_[i] = 0;
719                     ifd->borrowed_[i] = NULL;
720           }
721 
722           /*
723            * Create the root class of the link-sharing structure.
724            */
725           if ((ifd->root_ = rmc_newclass(0, ifd,
726                                                psecPerByte,
727                                                rmc_root_overlimit, maxq, 0, 0,
728                                                maxidle, minidle, offtime,
729                                                0, 0)) == NULL) {
730                     printf("rmc_init: root class not allocated\n");
731                     return ENOMEM;
732           }
733           ifd->root_->depth_ = 0;
734 
735           return 0;
736 }
737 
738 /*
739  * void
740  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
741  *        mbuf 'm' to queue for resource class 'cl'.  This routine is called
742  *        by a driver's if_output routine.  This routine must be called with
743  *        output packet completion interrupts locked out (to avoid racing with
744  *        rmc_dequeue_next).
745  *
746  *        Returns:  0 on successful queueing
747  *                            -1 when packet drop occurs
748  */
749 int
rmc_queue_packet(struct rm_class * cl,mbuf_t * m)750 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
751 {
752           struct timespec      now;
753           struct rm_ifdat *ifd = cl->ifdat_;
754           int                  cpri = cl->pri_;
755           int                  is_empty = qempty(cl->q_);
756 
757           RM_GETTIME(now);
758           if (ifd->cutoff_ > 0) {
759                     if (TS_LT(&cl->undertime_, &now)) {
760                               if (ifd->cutoff_ > cl->depth_)
761                                         ifd->cutoff_ = cl->depth_;
762                               CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
763                     }
764 #if 1 /* ALTQ */
765                     else {
766                               /*
767                                * the class is overlimit. if the class has
768                                * underlimit ancestors, set cutoff to the lowest
769                                * depth among them.
770                                */
771                               struct rm_class *borrow = cl->borrow_;
772 
773                               while (borrow != NULL &&
774                                      borrow->depth_ < ifd->cutoff_) {
775                                         if (TS_LT(&borrow->undertime_, &now)) {
776                                                   ifd->cutoff_ = borrow->depth_;
777                                                   CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
778                                                   break;
779                                         }
780                                         borrow = borrow->borrow_;
781                               }
782                     }
783 #else /* !ALTQ */
784                     else if ((ifd->cutoff_ > 1) && cl->borrow_) {
785                               if (TS_LT(&cl->borrow_->undertime_, &now)) {
786                                         ifd->cutoff_ = cl->borrow_->depth_;
787                                         CBQTRACE(rmc_queue_packet, 'ffob',
788                                                    cl->borrow_->depth_);
789                               }
790                     }
791 #endif /* !ALTQ */
792           }
793 
794           if (_rmc_addq(cl, m) < 0)
795                     /* failed */
796                     return -1;
797 
798           if (is_empty) {
799                     CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
800                     ifd->na_[cpri]++;
801           }
802 
803           if (qlen(cl->q_) > qlimit(cl->q_)) {
804                     /* note: qlimit can be set to 0 or 1 */
805                     rmc_drop_action(cl);
806                     return -1;
807           }
808           return 0;
809 }
810 
811 /*
812  * void
813  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timespec *now) - Check all
814  *        classes to see if there are satified.
815  */
816 
817 static void
rmc_tl_satisfied(struct rm_ifdat * ifd,struct timespec * now)818 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timespec *now)
819 {
820           int                  i;
821           rm_class_t          *p, *bp;
822 
823           for (i = RM_MAXPRIO - 1; i >= 0; i--) {
824                     if ((bp = ifd->active_[i]) != NULL) {
825                               p = bp;
826                               do {
827                                         if (!rmc_satisfied(p, now)) {
828                                                   ifd->cutoff_ = p->depth_;
829                                                   return;
830                                         }
831                                         p = p->peer_;
832                               } while (p != bp);
833                     }
834           }
835 
836           reset_cutoff(ifd);
837 }
838 
839 /*
840  * rmc_satisfied - Return 1 of the class is satisfied.  O, otherwise.
841  */
842 
843 static int
rmc_satisfied(struct rm_class * cl,struct timespec * now)844 rmc_satisfied(struct rm_class *cl, struct timespec *now)
845 {
846           rm_class_t          *p;
847 
848           if (cl == NULL)
849                     return 1;
850           if (TS_LT(now, &cl->undertime_))
851                     return 1;
852           if (cl->depth_ == 0) {
853                     if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
854                               return 0;
855                     else
856                               return 1;
857           }
858           if (cl->children_ != NULL) {
859                     p = cl->children_;
860                     while (p != NULL) {
861                               if (!rmc_satisfied(p, now))
862                                         return 0;
863                               p = p->next_;
864                     }
865           }
866 
867           return 1;
868 }
869 
870 /*
871  * Return 1 if class 'cl' is under limit or can borrow from a parent,
872  * 0 if overlimit.  As a side-effect, this routine will invoke the
873  * class overlimit action if the class if overlimit.
874  */
875 
876 static int
rmc_under_limit(struct rm_class * cl,struct timespec * now)877 rmc_under_limit(struct rm_class *cl, struct timespec *now)
878 {
879           rm_class_t          *p = cl;
880           rm_class_t          *top;
881           struct rm_ifdat     *ifd = cl->ifdat_;
882           int sleeping = cl->sleeping_;
883 
884           ifd->borrowed_[ifd->qi_] = NULL;
885           /*
886            * If cl is the root class, then always return that it is
887            * underlimit.  Otherwise, check to see if the class is underlimit.
888            */
889           if (cl->parent_ == NULL)
890                     return 1;
891 
892           if (!TS_LT(now, &cl->undertime_)) {
893                     /* Fast path: the given class is allowed to send packets */
894                     if (sleeping) {
895                               CALLOUT_STOP(&cl->callout_);
896                               cl->sleeping_ = 0;
897                               cl->undertime_.tv_sec = 0;
898                     }
899                     return 1;
900           }
901 
902           top = NULL;
903           do {
904                     if (((cl = cl->borrow_) == NULL) ||
905                         (cl->depth_ > ifd->cutoff_)) {
906                               /* No need to call the delay action */
907                               if (sleeping)
908                                         return 0;
909 #ifdef ADJUST_CUTOFF
910                               if (cl != NULL)
911                                         /* cutoff is taking effect, just
912                                            return false without calling
913                                            the delay action. */
914                                         return 0;
915 #endif
916 #ifdef BORROW_OFFTIME
917                               /*
918                                * check if the class can borrow offtime too.
919                                * borrow offtime from the top of the borrow
920                                * chain if the top class is not overloaded.
921                                */
922                               if (cl != NULL) {
923                                         /* cutoff is taking effect, use this class as top. */
924                                         top = cl;
925                                         CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
926                               }
927                               if (top != NULL && top->avgidle_ == top->minidle_)
928                                         top = NULL;
929                               p->overtime_ = *now;
930                               (p->overlimit)(p, top);
931 #else
932                               p->overtime_ = *now;
933                               (p->overlimit)(p, NULL);
934 #endif
935                               return 0;
936                     }
937                     top = cl;
938           } while (cl->undertime_.tv_sec && TS_LT(now, &cl->undertime_));
939 
940           if (cl != p)
941                     ifd->borrowed_[ifd->qi_] = cl;
942           return 1;
943 }
944 
945 /*
946  * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
947  *        Packet-by-packet round robin.
948  *
949  * The heart of the weighted round-robin scheduler, which decides which
950  * class next gets to send a packet.  Highest priority first, then
951  * weighted round-robin within priorites.
952  *
953  * Each able-to-send class gets to send until its byte allocation is
954  * exhausted.  Thus, the active pointer is only changed after a class has
955  * exhausted its allocation.
956  *
957  * If the scheduler finds no class that is underlimit or able to borrow,
958  * then the first class found that had a nonzero queue and is allowed to
959  * borrow gets to send.
960  */
961 
962 static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat * ifd,int op)963 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
964 {
965           struct rm_class     *cl = NULL, *first = NULL;
966           u_int                deficit;
967           int                  cpri;
968           mbuf_t              *m;
969           struct timespec      now;
970 
971           RM_GETTIME(now);
972 
973           /*
974            * if the driver polls the top of the queue and then removes
975            * the polled packet, we must return the same packet.
976            */
977           if (op == ALTDQ_REMOVE && ifd->pollcache_) {
978                     cl = ifd->pollcache_;
979                     cpri = cl->pri_;
980                     if (ifd->efficient_) {
981                               /* check if this class is overlimit */
982                               if (cl->undertime_.tv_sec != 0 &&
983                                   rmc_under_limit(cl, &now) == 0)
984                                         first = cl;
985                     }
986                     ifd->pollcache_ = NULL;
987                     goto _wrr_out;
988           }
989           else {
990                     /* mode == ALTDQ_POLL || pollcache == NULL */
991                     ifd->pollcache_ = NULL;
992                     ifd->borrowed_[ifd->qi_] = NULL;
993           }
994 #ifdef ADJUST_CUTOFF
995  _again:
996 #endif
997           for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
998                     if (ifd->na_[cpri] == 0)
999                               continue;
1000                     deficit = 0;
1001                     /*
1002                      * Loop through twice for a priority level, if some class
1003                      * was unable to send a packet the first round because
1004                      * of the weighted round-robin mechanism.
1005                      * During the second loop at this level, deficit==2.
1006                      * (This second loop is not needed if for every class,
1007                      * "M[cl->pri_])" times "cl->allotment" is greater than
1008                      * the byte size for the largest packet in the class.)
1009                      */
1010  _wrr_loop:
1011                     cl = ifd->active_[cpri];
1012                     ASSERT(cl != NULL);
1013                     do {
1014                               if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
1015                                         cl->bytes_alloc_ += cl->w_allotment_;
1016                               if (!qempty(cl->q_)) {
1017                                         if ((cl->undertime_.tv_sec == 0) ||
1018                                             rmc_under_limit(cl, &now)) {
1019                                                   if (cl->bytes_alloc_ > 0 || deficit > 1)
1020                                                             goto _wrr_out;
1021 
1022                                                   /* underlimit but no alloc */
1023                                                   deficit = 1;
1024 #if 1
1025                                                   ifd->borrowed_[ifd->qi_] = NULL;
1026 #endif
1027                                         }
1028                                         else if (first == NULL && cl->borrow_ != NULL)
1029                                                   first = cl; /* borrowing candidate */
1030                               }
1031 
1032                               cl->bytes_alloc_ = 0;
1033                               cl = cl->peer_;
1034                     } while (cl != ifd->active_[cpri]);
1035 
1036                     if (deficit == 1) {
1037                               /* first loop found an underlimit class with deficit */
1038                               /* Loop on same priority level, with new deficit.  */
1039                               deficit = 2;
1040                               goto _wrr_loop;
1041                     }
1042           }
1043 
1044 #ifdef ADJUST_CUTOFF
1045           /*
1046            * no underlimit class found.  if cutoff is taking effect,
1047            * increase cutoff and try again.
1048            */
1049           if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1050                     ifd->cutoff_++;
1051                     CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1052                     goto _again;
1053           }
1054 #endif /* ADJUST_CUTOFF */
1055           /*
1056            * If LINK_EFFICIENCY is turned on, then the first overlimit
1057            * class we encounter will send a packet if all the classes
1058            * of the link-sharing structure are overlimit.
1059            */
1060           reset_cutoff(ifd);
1061           CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1062 
1063           if (!ifd->efficient_ || first == NULL)
1064                     return NULL;
1065 
1066           cl = first;
1067           cpri = cl->pri_;
1068 #if 0     /* too time-consuming for nothing */
1069           if (cl->sleeping_)
1070                     CALLOUT_STOP(&cl->callout_);
1071           cl->sleeping_ = 0;
1072           cl->undertime_.tv_sec = 0;
1073 #endif
1074           ifd->borrowed_[ifd->qi_] = cl->borrow_;
1075           ifd->cutoff_ = cl->borrow_->depth_;
1076 
1077           /*
1078            * Deque the packet and do the book keeping...
1079            */
1080  _wrr_out:
1081           if (op == ALTDQ_REMOVE) {
1082                     m = _rmc_getq(cl);
1083                     if (m == NULL)
1084                               panic("_rmc_wrr_dequeue_next");
1085                     if (qempty(cl->q_))
1086                               ifd->na_[cpri]--;
1087 
1088                     /*
1089                      * Update class statistics and link data.
1090                      */
1091                     if (cl->bytes_alloc_ > 0)
1092                               cl->bytes_alloc_ -= m_pktlen(m);
1093 
1094                     if ((cl->bytes_alloc_ <= 0) || first == cl)
1095                               ifd->active_[cl->pri_] = cl->peer_;
1096                     else
1097                               ifd->active_[cl->pri_] = cl;
1098 
1099                     ifd->class_[ifd->qi_] = cl;
1100                     ifd->curlen_[ifd->qi_] = m_pktlen(m);
1101                     ifd->now_[ifd->qi_] = now;
1102                     ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1103                     ifd->queued_++;
1104           } else {
1105                     /* mode == ALTDQ_PPOLL */
1106                     m = _rmc_pollq(cl);
1107                     ifd->pollcache_ = cl;
1108           }
1109           return m;
1110 }
1111 
1112 /*
1113  * Dequeue & return next packet from the highest priority class that
1114  * has a packet to send & has enough allocation to send it.  This
1115  * routine is called by a driver whenever it needs a new packet to
1116  * output.
1117  */
1118 static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat * ifd,int op)1119 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1120 {
1121           mbuf_t              *m;
1122           int                  cpri;
1123           struct rm_class     *cl, *first = NULL;
1124           struct timespec      now;
1125 
1126           RM_GETTIME(now);
1127 
1128           /*
1129            * if the driver polls the top of the queue and then removes
1130            * the polled packet, we must return the same packet.
1131            */
1132           if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1133                     cl = ifd->pollcache_;
1134                     cpri = cl->pri_;
1135                     ifd->pollcache_ = NULL;
1136                     goto _prr_out;
1137           } else {
1138                     /* mode == ALTDQ_POLL || pollcache == NULL */
1139                     ifd->pollcache_ = NULL;
1140                     ifd->borrowed_[ifd->qi_] = NULL;
1141           }
1142 #ifdef ADJUST_CUTOFF
1143  _again:
1144 #endif
1145           for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1146                     if (ifd->na_[cpri] == 0)
1147                               continue;
1148                     cl = ifd->active_[cpri];
1149                     ASSERT(cl != NULL);
1150                     do {
1151                               if (!qempty(cl->q_)) {
1152                                         if ((cl->undertime_.tv_sec == 0) ||
1153                                             rmc_under_limit(cl, &now))
1154                                                   goto _prr_out;
1155                                         if (first == NULL && cl->borrow_ != NULL)
1156                                                   first = cl;
1157                               }
1158                               cl = cl->peer_;
1159                     } while (cl != ifd->active_[cpri]);
1160           }
1161 
1162 #ifdef ADJUST_CUTOFF
1163           /*
1164            * no underlimit class found.  if cutoff is taking effect, increase
1165            * cutoff and try again.
1166            */
1167           if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1168                     ifd->cutoff_++;
1169                     goto _again;
1170           }
1171 #endif /* ADJUST_CUTOFF */
1172           /*
1173            * If LINK_EFFICIENCY is turned on, then the first overlimit
1174            * class we encounter will send a packet if all the classes
1175            * of the link-sharing structure are overlimit.
1176            */
1177           reset_cutoff(ifd);
1178           if (!ifd->efficient_ || first == NULL)
1179                     return NULL;
1180 
1181           cl = first;
1182           cpri = cl->pri_;
1183 #if 0     /* too time-consuming for nothing */
1184           if (cl->sleeping_)
1185                     CALLOUT_STOP(&cl->callout_);
1186           cl->sleeping_ = 0;
1187           cl->undertime_.tv_sec = 0;
1188 #endif
1189           ifd->borrowed_[ifd->qi_] = cl->borrow_;
1190           ifd->cutoff_ = cl->borrow_->depth_;
1191 
1192           /*
1193            * Deque the packet and do the book keeping...
1194            */
1195  _prr_out:
1196           if (op == ALTDQ_REMOVE) {
1197                     m = _rmc_getq(cl);
1198                     if (m == NULL)
1199                               panic("_rmc_prr_dequeue_next");
1200                     if (qempty(cl->q_))
1201                               ifd->na_[cpri]--;
1202 
1203                     ifd->active_[cpri] = cl->peer_;
1204 
1205                     ifd->class_[ifd->qi_] = cl;
1206                     ifd->curlen_[ifd->qi_] = m_pktlen(m);
1207                     ifd->now_[ifd->qi_] = now;
1208                     ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1209                     ifd->queued_++;
1210           } else {
1211                     /* mode == ALTDQ_POLL */
1212                     m = _rmc_pollq(cl);
1213                     ifd->pollcache_ = cl;
1214           }
1215           return m;
1216 }
1217 
1218 /*
1219  * mbuf_t *
1220  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timespec *now) - this function
1221  *        is invoked by the packet driver to get the next packet to be
1222  *        dequeued and output on the link.  If WRR is enabled, then the
1223  *        WRR dequeue next routine will determine the next packet to sent.
1224  *        Otherwise, packet-by-packet round robin is invoked.
1225  *
1226  *        Returns:  NULL, if a packet is not available or if all
1227  *                            classes are overlimit.
1228  *
1229  *                            Otherwise, Pointer to the next packet.
1230  */
1231 
1232 mbuf_t *
rmc_dequeue_next(struct rm_ifdat * ifd,int mode)1233 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1234 {
1235           if (ifd->queued_ >= ifd->maxqueued_)
1236                     return NULL;
1237           else if (ifd->wrr_)
1238                     return (_rmc_wrr_dequeue_next(ifd, mode));
1239           else
1240                     return (_rmc_prr_dequeue_next(ifd, mode));
1241 }
1242 
1243 /*
1244  * Update the utilization estimate for the packet that just completed.
1245  * The packet's class & the parent(s) of that class all get their
1246  * estimators updated.  This routine is called by the driver's output-
1247  * packet-completion interrupt service routine.
1248  */
1249 
1250 /*
1251  * a macro to approximate "divide by 1000" that gives 0.000999,
1252  * if a value has enough effective digits.
1253  * (on pentium, mul takes 9 cycles but div takes 46!)
1254  */
1255 #define   NSEC_TO_USEC(t)     (((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1256 /* Don't worry.  Recent compilers don't use div. */
1257 #define   PSEC_TO_USEC(t)     ((t) / 1000 / 1000)
1258 void
rmc_update_class_util(struct rm_ifdat * ifd)1259 rmc_update_class_util(struct rm_ifdat *ifd)
1260 {
1261           int64_t              idle, avgidle, pktlen;
1262           int64_t              pkt_time;
1263           int64_t              tidle;
1264           rm_class_t          *cl, *cl0, *borrowed;
1265           rm_class_t          *borrows;
1266           struct timespec     *nowp;
1267 
1268           /*
1269            * Get the most recent completed class.
1270            */
1271           if ((cl = ifd->class_[ifd->qo_]) == NULL)
1272                     return;
1273 
1274           cl0 = cl;
1275           pktlen = (int64_t)ifd->curlen_[ifd->qo_];
1276           borrowed = ifd->borrowed_[ifd->qo_];
1277           borrows = borrowed;
1278 
1279           PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1280 
1281           /*
1282            * Run estimator on class and its ancestors.
1283            */
1284           /*
1285            * rm_update_class_util is designed to be called when the
1286            * transfer is completed from a xmit complete interrupt,
1287            * but most drivers don't implement an upcall for that.
1288            * so, just use estimated completion time.
1289            * as a result, ifd->qi_ and ifd->qo_ are always synced.
1290            */
1291           nowp = &ifd->now_[ifd->qo_];
1292           /* get pkt_time (for link) in usec */
1293 #if 1  /* use approximation */
1294           pkt_time = (int64_t)ifd->curlen_[ifd->qo_] * (int64_t)ifd->ps_per_byte_;
1295           pkt_time = PSEC_TO_NSEC(pkt_time);
1296 #else
1297           pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1298 #endif
1299           if (ifd->ifq_->altq_ifp->if_type == IFT_PPP) {
1300                     if (TS_LT(nowp, &ifd->ifnow_)) {
1301                               int iftime;
1302 
1303                               /*
1304                                * make sure the estimated completion time does not go
1305                                * too far.  it can happen when the link layer supports
1306                                * data compression or the interface speed is set to
1307                                * a much lower value.
1308                                */
1309                               TS_DELTA(&ifd->ifnow_, nowp, iftime);
1310                               if (iftime+pkt_time < ifd->maxiftime_) {
1311                                         TS_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1312                               } else {
1313                                         TS_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1314                               }
1315                     } else {
1316                               TS_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1317                     }
1318           } else {
1319                     if (TS_LT(nowp, &ifd->ifnow_)) {
1320                               TS_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1321                     } else {
1322                               TS_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1323                     }
1324           }
1325 
1326           while (cl != NULL) {
1327                     TS_DELTA(&ifd->ifnow_, &cl->last_, idle);
1328                     if (idle >= 2000000000)
1329                               /*
1330                                * this class is idle enough, reset avgidle.
1331                                * (TS_DELTA returns 2000000000 ns when delta is large.)
1332                                */
1333                               cl->avgidle_ = cl->maxidle_;
1334 
1335                     /* get pkt_time (for class) in usec */
1336 #if 1  /* use approximation */
1337                     pkt_time = pktlen * (int64_t)cl->ps_per_byte_;
1338                     pkt_time = PSEC_TO_NSEC(pkt_time);
1339 #else
1340                     pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1341 #endif
1342                     idle -= pkt_time;
1343 
1344                     avgidle = cl->avgidle_;
1345                     avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1346                     cl->avgidle_ = avgidle;
1347 
1348                     /* Are we overlimit ? */
1349                     if (avgidle <= 0) {
1350                               CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1351 #if 1 /* ALTQ */
1352                               /*
1353                                * need some lower bound for avgidle, otherwise
1354                                * a borrowing class gets unbounded penalty.
1355                                */
1356                               if (avgidle < cl->minidle_)
1357                                         avgidle = cl->avgidle_ = cl->minidle_;
1358 #endif
1359                               /* set next idle to make avgidle 0 */
1360                               tidle = pkt_time +
1361                                         (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1362                               TS_ADD_DELTA(nowp, tidle, &cl->undertime_);
1363                               ++cl->stats_.over;
1364                     } else {
1365                               cl->avgidle_ =
1366                                   (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1367                               cl->undertime_.tv_sec = 0;
1368                               if (cl->sleeping_) {
1369                                         CALLOUT_STOP(&cl->callout_);
1370                                         cl->sleeping_ = 0;
1371                               }
1372                     }
1373 
1374                     if (borrows != NULL) {
1375                               if (borrows != cl)
1376                                         ++cl->stats_.borrows;
1377                               else
1378                                         borrows = NULL;
1379                     }
1380                     cl->last_ = ifd->ifnow_;
1381                     cl->last_pkttime_ = pkt_time;
1382 
1383 #if 1
1384                     if (cl->parent_ == NULL && cl != cl0) {
1385                               /* take stats of root class */
1386                               PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1387                     }
1388 #endif
1389 
1390                     cl = cl->parent_;
1391           }
1392 
1393           /*
1394            * Check to see if cutoff needs to set to a new level.
1395            */
1396           cl = ifd->class_[ifd->qo_];
1397           if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1398 #if 1 /* ALTQ */
1399                     if ((qlen(cl->q_) <= 0) || TS_LT(nowp, &borrowed->undertime_)) {
1400                               rmc_tl_satisfied(ifd, nowp);
1401                               CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1402                     } else {
1403                               ifd->cutoff_ = borrowed->depth_;
1404                               CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1405                     }
1406 #else /* !ALTQ */
1407                     if ((qlen(cl->q_) <= 1) || TS_LT(&now, &borrowed->undertime_)) {
1408                               reset_cutoff(ifd);
1409 #ifdef notdef
1410                               rmc_tl_satisfied(ifd, &now);
1411 #endif
1412                               CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1413                     } else {
1414                               ifd->cutoff_ = borrowed->depth_;
1415                               CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1416                     }
1417 #endif /* !ALTQ */
1418           }
1419 
1420           /*
1421            * Release class slot
1422            */
1423           ifd->borrowed_[ifd->qo_] = NULL;
1424           ifd->class_[ifd->qo_] = NULL;
1425           ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1426           ifd->queued_--;
1427 }
1428 
1429 /*
1430  * void
1431  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1432  *        over-limit action routines.  These get invoked by rmc_under_limit()
1433  *        if a class with packets to send if over its bandwidth limit & can't
1434  *        borrow from a parent class.
1435  *
1436  *        Returns: NONE
1437  */
1438 
1439 static void
rmc_drop_action(struct rm_class * cl)1440 rmc_drop_action(struct rm_class *cl)
1441 {
1442           struct rm_ifdat     *ifd = cl->ifdat_;
1443 
1444           ASSERT(qlen(cl->q_) > 0);
1445           _rmc_dropq(cl);
1446           if (qempty(cl->q_))
1447                     ifd->na_[cl->pri_]--;
1448 }
1449 
1450 void
rmc_dropall(struct rm_class * cl)1451 rmc_dropall(struct rm_class *cl)
1452 {
1453           struct rm_ifdat     *ifd = cl->ifdat_;
1454 
1455           if (!qempty(cl->q_)) {
1456                     _flushq(cl->q_);
1457 
1458                     ifd->na_[cl->pri_]--;
1459           }
1460 }
1461 
1462 #if (__FreeBSD_version > 300000)
1463 static int tvhzto(struct timeval *);
1464 
1465 static int
tvhzto(struct timeval * tv)1466 tvhzto(struct timeval *tv)
1467 {
1468           struct timeval t2;
1469 
1470           getmicrotime(&t2);
1471           t2.tv_sec = tv->tv_sec - t2.tv_sec;
1472           t2.tv_usec = tv->tv_usec - t2.tv_usec;
1473           return tvtohz(&t2);
1474 }
1475 #endif /* __FreeBSD_version > 300000 */
1476 
1477 /*
1478  * void
1479  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1480  *        delay action routine.  It is invoked via rmc_under_limit when the
1481  *        packet is discovered to be overlimit.
1482  *
1483  *        If the delay action is result of borrow class being overlimit, then
1484  *        delay for the offtime of the borrowing class that is overlimit.
1485  *
1486  *        Returns: NONE
1487  */
1488 
1489 void
rmc_delay_action(struct rm_class * cl,struct rm_class * borrow)1490 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1491 {
1492           int       t;
1493           int64_t   ndelay, extradelay;
1494 
1495           cl->stats_.overactions++;
1496           if (borrow != NULL)
1497                     TS_DELTA(&borrow->undertime_, &cl->overtime_, ndelay);
1498           else
1499                     TS_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
1500 #ifndef BORROW_OFFTIME
1501           ndelay += cl->offtime_;
1502 #endif
1503 
1504           if (!cl->sleeping_) {
1505                     CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1506 #ifdef BORROW_OFFTIME
1507                     if (borrow != NULL)
1508                               extradelay = borrow->offtime_;
1509                     else
1510 #endif
1511                               extradelay = cl->offtime_;
1512 
1513 #ifdef ALTQ
1514                     /*
1515                      * XXX recalculate suspend time:
1516                      * current undertime is (tidle + pkt_time) calculated
1517                      * from the last transmission.
1518                      *        tidle: time required to bring avgidle back to 0
1519                      *        pkt_time: target waiting time for this class
1520                      * we need to replace pkt_time by offtime
1521                      */
1522                     extradelay -= cl->last_pkttime_;
1523 #endif
1524                     if (extradelay > 0) {
1525                               TS_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1526                               ndelay += extradelay;
1527                     }
1528 
1529                     cl->sleeping_ = 1;
1530                     cl->stats_.delays++;
1531 
1532                     /*
1533                      * Since packets are phased randomly with respect to the
1534                      * clock, 1 tick (the next clock tick) can be an arbitrarily
1535                      * short time so we have to wait for at least two ticks.
1536                      * NOTE:  If there's no other traffic, we need the timer as
1537                      * a 'backstop' to restart this class.
1538                      */
1539                     if (NSEC_TO_USEC(ndelay) > tick * 2) {
1540 #ifdef __FreeBSD__
1541                               /* FreeBSD rounds up the tick */
1542                               t = tvhzto(&cl->undertime_);
1543 #else
1544                               /* other BSDs round down the tick */
1545                               t = tshzto(&cl->undertime_) + 1;
1546 #endif
1547                     } else
1548                               t = 2;
1549                     CALLOUT_RESET(&cl->callout_, t,
1550                                     (timeout_t *)rmc_restart, (void *)cl);
1551           }
1552 }
1553 
1554 /*
1555  * void
1556  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
1557  *        called by the system timer code & is responsible checking if the
1558  *        class is still sleeping (it might have been restarted as a side
1559  *        effect of the queue scan on a packet arrival) and, if so, restarting
1560  *        output for the class.  Inspecting the class state & restarting output
1561  *        require locking the class structure.  In general the driver is
1562  *        responsible for locking but this is the only routine that is not
1563  *        called directly or indirectly from the interface driver so it has
1564  *        know about system locking conventions.  Under bsd, locking is done
1565  *        by raising IPL to splnet so that's what's implemented here.  On a
1566  *        different system this would probably need to be changed.
1567  *
1568  *        Returns:  NONE
1569  */
1570 
1571 static void
rmc_restart(struct rm_class * cl)1572 rmc_restart(struct rm_class *cl)
1573 {
1574           struct rm_ifdat     *ifd = cl->ifdat_;
1575           int                  s;
1576 
1577           s = splnet();
1578           if (cl->sleeping_) {
1579                     cl->sleeping_ = 0;
1580                     cl->undertime_.tv_sec = 0;
1581 
1582                     if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1583                               CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1584                               (ifd->restart)(ifd->ifq_);
1585                     }
1586           }
1587           splx(s);
1588 }
1589 
1590 /*
1591  * void
1592  * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
1593  *        handling routine for the root class of the link sharing structure.
1594  *
1595  *        Returns: NONE
1596  */
1597 
1598 static void
rmc_root_overlimit(struct rm_class * cl,struct rm_class * borrow)1599 rmc_root_overlimit(struct rm_class *cl,
1600     struct rm_class *borrow)
1601 {
1602           panic("rmc_root_overlimit");
1603 }
1604 
1605 /*
1606  * Packet Queue handling routines.  Eventually, this is to localize the
1607  *        effects on the code whether queues are red queues or droptail
1608  *        queues.
1609  */
1610 
1611 static int
_rmc_addq(rm_class_t * cl,mbuf_t * m)1612 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1613 {
1614 #ifdef ALTQ_RIO
1615           if (q_is_rio(cl->q_))
1616                     return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1617 #endif
1618 #ifdef ALTQ_RED
1619           if (q_is_red(cl->q_))
1620                     return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1621 #endif /* ALTQ_RED */
1622 
1623           if (cl->flags_ & RMCF_CLEARDSCP)
1624                     write_dsfield(m, cl->pktattr_, 0);
1625 
1626           _addq(cl->q_, m);
1627           return 0;
1628 }
1629 
1630 /* note: _rmc_dropq is not called for red */
1631 static void
_rmc_dropq(rm_class_t * cl)1632 _rmc_dropq(rm_class_t *cl)
1633 {
1634           mbuf_t    *m;
1635 
1636           if ((m = _getq(cl->q_)) != NULL)
1637                     m_freem(m);
1638 }
1639 
1640 static mbuf_t *
_rmc_getq(rm_class_t * cl)1641 _rmc_getq(rm_class_t *cl)
1642 {
1643 #ifdef ALTQ_RIO
1644           if (q_is_rio(cl->q_))
1645                     return rio_getq((rio_t *)cl->red_, cl->q_);
1646 #endif
1647 #ifdef ALTQ_RED
1648           if (q_is_red(cl->q_))
1649                     return red_getq(cl->red_, cl->q_);
1650 #endif
1651           return _getq(cl->q_);
1652 }
1653 
1654 static mbuf_t *
_rmc_pollq(rm_class_t * cl)1655 _rmc_pollq(rm_class_t *cl)
1656 {
1657           return qhead(cl->q_);
1658 }
1659 
1660 #ifdef CBQ_TRACE
1661 
1662 struct cbqtrace                cbqtrace_buffer[NCBQTRACE+1];
1663 struct cbqtrace               *cbqtrace_ptr = NULL;
1664 int                            cbqtrace_count;
1665 
1666 /*
1667  * DDB hook to trace cbq events:
1668  *  the last 1024 events are held in a circular buffer.
1669  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
1670  */
1671 void cbqtrace_dump(int);
1672 static char *rmc_funcname(void *);
1673 
1674 static struct rmc_funcs {
1675           void      *func;
1676           char      *name;
1677 } rmc_funcs[] =
1678 {
1679           rmc_init,           "rmc_init",
1680           rmc_queue_packet,   "rmc_queue_packet",
1681           rmc_under_limit,    "rmc_under_limit",
1682           rmc_update_class_util,        "rmc_update_class_util",
1683           rmc_delay_action,   "rmc_delay_action",
1684           rmc_restart,                  "rmc_restart",
1685           _rmc_wrr_dequeue_next,        "_rmc_wrr_dequeue_next",
1686           NULL,                         NULL
1687 };
1688 
1689 static char *
rmc_funcname(void * func)1690 rmc_funcname(void *func)
1691 {
1692           struct rmc_funcs *fp;
1693 
1694           for (fp = rmc_funcs; fp->func != NULL; fp++)
1695                     if (fp->func == func)
1696                               return (fp->name);
1697           return ("unknown");
1698 }
1699 
1700 void
cbqtrace_dump(int counter)1701 cbqtrace_dump(int counter)
1702 {
1703           int        i, *p;
1704           char      *cp;
1705 
1706           counter = counter % NCBQTRACE;
1707           p = (int *)&cbqtrace_buffer[counter];
1708 
1709           for (i=0; i<20; i++) {
1710                     printf("[0x%x] ", *p++);
1711                     printf("%s: ", rmc_funcname((void *)*p++));
1712                     cp = (char *)p++;
1713                     printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1714                     printf("%d\n",*p++);
1715 
1716                     if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1717                               p = (int *)cbqtrace_buffer;
1718           }
1719 }
1720 #endif /* CBQ_TRACE */
1721 #endif /* ALTQ_CBQ */
1722 
1723 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
1724 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1725 
1726 void
_addq(class_queue_t * q,mbuf_t * m)1727 _addq(class_queue_t *q, mbuf_t *m)
1728 {
1729         mbuf_t      *m0;
1730 
1731           if ((m0 = qtail(q)) != NULL)
1732                     m->m_nextpkt = m0->m_nextpkt;
1733           else
1734                     m0 = m;
1735           m0->m_nextpkt = m;
1736           qtail(q) = m;
1737           qlen(q)++;
1738 }
1739 
1740 mbuf_t *
_getq(class_queue_t * q)1741 _getq(class_queue_t *q)
1742 {
1743           mbuf_t    *m, *m0;
1744 
1745           if ((m = qtail(q)) == NULL)
1746                     return NULL;
1747           if ((m0 = m->m_nextpkt) != m)
1748                     m->m_nextpkt = m0->m_nextpkt;
1749           else {
1750                     ASSERT(qlen(q) == 1);
1751                     qtail(q) = NULL;
1752           }
1753           qlen(q)--;
1754           m0->m_nextpkt = NULL;
1755           return m0;
1756 }
1757 
1758 /* drop a packet at the tail of the queue */
1759 mbuf_t *
_getq_tail(class_queue_t * q)1760 _getq_tail(class_queue_t *q)
1761 {
1762           mbuf_t    *m, *m0, *prev;
1763 
1764           if ((m = m0 = qtail(q)) == NULL)
1765                     return NULL;
1766           do {
1767                     prev = m0;
1768                     m0 = m0->m_nextpkt;
1769           } while (m0 != m);
1770           prev->m_nextpkt = m->m_nextpkt;
1771           if (prev == m)  {
1772                     ASSERT(qlen(q) == 1);
1773                     qtail(q) = NULL;
1774           } else
1775                     qtail(q) = prev;
1776           qlen(q)--;
1777           m->m_nextpkt = NULL;
1778           return m;
1779 }
1780 
1781 /* randomly select a packet in the queue */
1782 mbuf_t *
_getq_random(class_queue_t * q)1783 _getq_random(class_queue_t *q)
1784 {
1785           struct mbuf         *m;
1786           int                  i, n;
1787 
1788           if ((m = qtail(q)) == NULL)
1789                     return NULL;
1790           if (m->m_nextpkt == m) {
1791                     ASSERT(qlen(q) == 1);
1792                     qtail(q) = NULL;
1793           } else {
1794                     struct mbuf *prev = NULL;
1795 
1796                     n = cprng_fast32() % qlen(q) + 1;
1797                     for (i = 0; i < n; i++) {
1798                               prev = m;
1799                               m = m->m_nextpkt;
1800                     }
1801                     prev->m_nextpkt = m->m_nextpkt;
1802                     if (m == qtail(q))
1803                               qtail(q) = prev;
1804           }
1805           qlen(q)--;
1806           m->m_nextpkt = NULL;
1807           return m;
1808 }
1809 
1810 void
_removeq(class_queue_t * q,mbuf_t * m)1811 _removeq(class_queue_t *q, mbuf_t *m)
1812 {
1813           mbuf_t    *m0, *prev;
1814 
1815           m0 = qtail(q);
1816           do {
1817                     prev = m0;
1818                     m0 = m0->m_nextpkt;
1819           } while (m0 != m);
1820           prev->m_nextpkt = m->m_nextpkt;
1821           if (prev == m)
1822                     qtail(q) = NULL;
1823           else if (qtail(q) == m)
1824                     qtail(q) = prev;
1825           qlen(q)--;
1826 }
1827 
1828 void
_flushq(class_queue_t * q)1829 _flushq(class_queue_t *q)
1830 {
1831           mbuf_t *m;
1832 
1833           while ((m = _getq(q)) != NULL)
1834                     m_freem(m);
1835           ASSERT(qlen(q) == 0);
1836 }
1837 
1838 #endif /* !__GNUC__ || ALTQ_DEBUG */
1839 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1840