1 /*	$OpenBSD: altq_red.c,v 1.11 2003/05/12 00:50:12 henning Exp $	*/
2 /*	$KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $	*/
3 
4 /*
5  * Copyright (C) 1997-2002
6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 /*
31  * Copyright (c) 1990-1994 Regents of the University of California.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. All advertising materials mentioning features or use of this software
43  *    must display the following acknowledgement:
44  *	This product includes software developed by the Computer Systems
45  *	Engineering Group at Lawrence Berkeley Laboratory.
46  * 4. Neither the name of the University nor of the Laboratory may be used
47  *    to endorse or promote products derived from this software without
48  *    specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  */
62 
63 #include <sys/param.h>
64 #include <sys/malloc.h>
65 #include <sys/mbuf.h>
66 #include <sys/socket.h>
67 #include <sys/systm.h>
68 #include <sys/errno.h>
69 
70 #include <net/if.h>
71 #include <net/if_types.h>
72 
73 #include <netinet/in.h>
74 #include <netinet/in_systm.h>
75 #include <netinet/ip.h>
76 #ifdef INET6
77 #include <netinet/ip6.h>
78 #endif
79 
80 #include <net/pfvar.h>
81 #include <altq/altq.h>
82 #include <altq/altq_red.h>
83 
/*
 * ALTQ/RED (Random Early Detection) implementation using 32-bit
 * fixed-point calculation.
 *
 * written by kjc using the ns code as a reference.
 * you can learn more about red and ns from Sally's home page at
 * http://www-nrg.ee.lbl.gov/floyd/
 *
 * most of the red parameter values are fixed in this implementation
 * to prevent fixed-point overflow/underflow.
 * if you change the parameters, watch out for overflow/underflow!
 *
 * the parameters used are the values recommended by Sally.
 * the corresponding ns config looks like:
 *	q_weight=0.00195
 *	minthresh=5 maxthresh=15 queue-size=60
 *	linterm=30
 *	dropmech=drop-tail
 *	bytes=false (can't be handled by 32-bit fixed-point)
 *	doubleq=false dqthresh=false
 *	wait=true
 */
/*
 * alternative red parameters for a slow link.
 *
 * assume the queue length jumps from zero to L and then stays at L;
 * it takes N packets for q_avg to reach 63% of L.
 * when q_weight is 0.002, N is about 500 packets.
 * for a slow link like dial-up, 500 packets take more than 1 minute!
 * when q_weight is 0.008, N is about 127 packets.
 * when q_weight is 0.016, N is about 63 packets.
 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
 * are allowed for 0.016.
 * see Sally's paper for more details.
 */
/*
 * EWMA weights.  each value is the inverse of q_weight; red_alloc()
 * converts it into a shift count, so it must be a power of 2.
 */
/* normal red parameters */
#define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
				/* q_weight = 0.00195 */

/* red parameters for a slow link */
#define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
				/* q_weight = 0.0078125 */

/* red parameters for a very slow link (e.g., dialup) */
#define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
				/* q_weight = 0.015625 */

/* fixed-point values carry 12 fractional bits */
#define	FP_SHIFT	12	/* fixed-point shift */

/* red parameters for drop probability */
#define	INV_P_MAX	10	/* inverse of max drop probability */
#define	TH_MIN		5	/* min threshold */
#define	TH_MAX		15	/* max threshold */

#define	RED_LIMIT	60	/* default max queue length */
#define	RED_STATS		/* collect statistics */
141 
142 /*
143  * our default policy for forced-drop is drop-tail.
144  * (in altq-1.1.2 or earlier, the default was random-drop.
145  * but it makes more sense to punish the cause of the surge.)
146  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
147  */
148 
149 /* default red parameter values */
150 static int default_th_min = TH_MIN;
151 static int default_th_max = TH_MAX;
152 static int default_inv_pmax = INV_P_MAX;
153 
154 /*
155  * red support routines
156  */
157 red_t *
red_alloc(int weight,int inv_pmax,int th_min,int th_max,int flags,int pkttime)158 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
159    int pkttime)
160 {
161 	red_t	*rp;
162 	int	 w, i;
163 	int	 npkts_per_sec;
164 
165 	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
166 	if (rp == NULL)
167 		return (NULL);
168 	bzero(rp, sizeof(red_t));
169 
170 	rp->red_avg = 0;
171 	rp->red_idle = 1;
172 
173 	if (weight == 0)
174 		rp->red_weight = W_WEIGHT;
175 	else
176 		rp->red_weight = weight;
177 	if (inv_pmax == 0)
178 		rp->red_inv_pmax = default_inv_pmax;
179 	else
180 		rp->red_inv_pmax = inv_pmax;
181 	if (th_min == 0)
182 		rp->red_thmin = default_th_min;
183 	else
184 		rp->red_thmin = th_min;
185 	if (th_max == 0)
186 		rp->red_thmax = default_th_max;
187 	else
188 		rp->red_thmax = th_max;
189 
190 	rp->red_flags = flags;
191 
192 	if (pkttime == 0)
193 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
194 		rp->red_pkttime = 800;
195 	else
196 		rp->red_pkttime = pkttime;
197 
198 	if (weight == 0) {
199 		/* when the link is very slow, adjust red parameters */
200 		npkts_per_sec = 1000000 / rp->red_pkttime;
201 		if (npkts_per_sec < 50) {
202 			/* up to about 400Kbps */
203 			rp->red_weight = W_WEIGHT_2;
204 		} else if (npkts_per_sec < 300) {
205 			/* up to about 2.4Mbps */
206 			rp->red_weight = W_WEIGHT_1;
207 		}
208 	}
209 
210 	/* calculate wshift.  weight must be power of 2 */
211 	w = rp->red_weight;
212 	for (i = 0; w > 1; i++)
213 		w = w >> 1;
214 	rp->red_wshift = i;
215 	w = 1 << rp->red_wshift;
216 	if (w != rp->red_weight) {
217 		printf("invalid weight value %d for red! use %d\n",
218 		       rp->red_weight, w);
219 		rp->red_weight = w;
220 	}
221 
222 	/*
223 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
224 	 * to be compared with avg.
225 	 */
226 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
227 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
228 
229 	/*
230 	 * precompute probability denominator
231 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
232 	 */
233 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
234 			 * rp->red_inv_pmax) << FP_SHIFT;
235 
236 	/* allocate weight table */
237 	rp->red_wtab = wtab_alloc(rp->red_weight);
238 
239 	microtime(&rp->red_last);
240 	return (rp);
241 }
242 
243 void
red_destroy(red_t * rp)244 red_destroy(red_t *rp)
245 {
246 	wtab_destroy(rp->red_wtab);
247 	FREE(rp, M_DEVBUF);
248 }
249 
250 void
red_getstats(red_t * rp,struct redstats * sp)251 red_getstats(red_t *rp, struct redstats *sp)
252 {
253 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
254 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
255 	sp->drop_cnt		= rp->red_stats.drop_cnt;
256 	sp->drop_forced		= rp->red_stats.drop_forced;
257 	sp->drop_unforced	= rp->red_stats.drop_unforced;
258 	sp->marked_packets	= rp->red_stats.marked_packets;
259 }
260 
261 int
red_addq(red_t * rp,class_queue_t * q,struct mbuf * m,struct altq_pktattr * pktattr)262 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
263     struct altq_pktattr *pktattr)
264 {
265 	int avg, droptype;
266 	int n;
267 
268 	avg = rp->red_avg;
269 
270 	/*
271 	 * if we were idle, we pretend that n packets arrived during
272 	 * the idle period.
273 	 */
274 	if (rp->red_idle) {
275 		struct timeval now;
276 		int t;
277 
278 		rp->red_idle = 0;
279 		microtime(&now);
280 		t = (now.tv_sec - rp->red_last.tv_sec);
281 		if (t > 60) {
282 			/*
283 			 * being idle for more than 1 minute, set avg to zero.
284 			 * this prevents t from overflow.
285 			 */
286 			avg = 0;
287 		} else {
288 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
289 			n = t / rp->red_pkttime - 1;
290 
291 			/* the following line does (avg = (1 - Wq)^n * avg) */
292 			if (n > 0)
293 				avg = (avg >> FP_SHIFT) *
294 				    pow_w(rp->red_wtab, n);
295 		}
296 	}
297 
298 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
299 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
300 	rp->red_avg = avg;		/* save the new value */
301 
302 	/*
303 	 * red_count keeps a tally of arriving traffic that has not
304 	 * been dropped.
305 	 */
306 	rp->red_count++;
307 
308 	/* see if we drop early */
309 	droptype = DTYPE_NODROP;
310 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
311 		if (avg >= rp->red_thmax_s) {
312 			/* avg >= th_max: forced drop */
313 			droptype = DTYPE_FORCED;
314 		} else if (rp->red_old == 0) {
315 			/* first exceeds th_min */
316 			rp->red_count = 1;
317 			rp->red_old = 1;
318 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
319 				      rp->red_probd, rp->red_count)) {
320 			/* mark or drop by red */
321 			if ((rp->red_flags & REDF_ECN) &&
322 			    mark_ecn(m, pktattr, rp->red_flags)) {
323 				/* successfully marked.  do not drop. */
324 				rp->red_count = 0;
325 #ifdef RED_STATS
326 				rp->red_stats.marked_packets++;
327 #endif
328 			} else {
329 				/* unforced drop by red */
330 				droptype = DTYPE_EARLY;
331 			}
332 		}
333 	} else {
334 		/* avg < th_min */
335 		rp->red_old = 0;
336 	}
337 
338 	/*
339 	 * if the queue length hits the hard limit, it's a forced drop.
340 	 */
341 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
342 		droptype = DTYPE_FORCED;
343 
344 #ifdef RED_RANDOM_DROP
345 	/* if successful or forced drop, enqueue this packet. */
346 	if (droptype != DTYPE_EARLY)
347 		_addq(q, m);
348 #else
349 	/* if successful, enqueue this packet. */
350 	if (droptype == DTYPE_NODROP)
351 		_addq(q, m);
352 #endif
353 	if (droptype != DTYPE_NODROP) {
354 		if (droptype == DTYPE_EARLY) {
355 			/* drop the incoming packet */
356 #ifdef RED_STATS
357 			rp->red_stats.drop_unforced++;
358 #endif
359 		} else {
360 			/* forced drop, select a victim packet in the queue. */
361 #ifdef RED_RANDOM_DROP
362 			m = _getq_random(q);
363 #endif
364 #ifdef RED_STATS
365 			rp->red_stats.drop_forced++;
366 #endif
367 		}
368 #ifdef RED_STATS
369 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
370 #endif
371 		rp->red_count = 0;
372 		m_freem(m);
373 		return (-1);
374 	}
375 	/* successfully queued */
376 #ifdef RED_STATS
377 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
378 #endif
379 	return (0);
380 }
381 
/*
 * early-drop probability is calculated as follows:
 *   prob = p_max * (avg - th_min) / (th_max - th_min)
 *   prob_a = prob / (2 - count*prob)
 *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
 * here prob_a increases as successive undrop count increases.
 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
 * becomes 1 when (count >= (2 / prob))).
 *
 *	fp_len		(avg - th_min) in fixed-point
 *	fp_probd	2*(th_max-th_min)*inv_p_max in fixed-point
 *	count		packets accepted since the last drop or mark
 *
 * returns 1 if the packet should be dropped or marked, 0 otherwise.
 */
int
drop_early(int fp_len, int fp_probd, int count)
{
	int	d;		/* denominator of drop-probability */

	d = fp_probd - count * fp_len;
	if (d <= 0)
		/* count exceeds the hard limit: drop or mark */
		return (1);

	/*
	 * now d is positive, and at most fp_probd when count and fp_len
	 * are non-negative.  (with the default parameters,
	 * th_max-th_min=10 and p_max=1/10, fp_probd is 200 in fixed-point.)
	 * drop probability = (avg - TH_MIN) / d
	 */

	if (arc4random_uniform(d) < fp_len) {
		/* drop or mark */
		return (1);
	}
	/* no drop/mark */
	return (0);
}
414 
415 /*
416  * try to mark CE bit to the packet.
417  *    returns 1 if successfully marked, 0 otherwise.
418  */
419 int
mark_ecn(struct mbuf * m,struct altq_pktattr * pktattr,int flags)420 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
421 {
422 	struct mbuf	*m0;
423 	struct m_tag	*t;
424 	struct altq_tag	*at;
425 	void		*hdr;
426 	int		 af;
427 
428 	t = m_tag_find(m, PACKET_TAG_PF_QID, NULL);
429 	if (t == NULL)
430 		return (0);
431 	at = (struct altq_tag *)(t + 1);
432 
433 	if (at == NULL)
434 		return (0);
435 
436 	af = at->af;
437 	hdr = at->hdr;
438 
439 	if (af != AF_INET && af != AF_INET6)
440 		return (0);
441 
442 	/* verify that pattr_hdr is within the mbuf data */
443 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
444 		if (((caddr_t)hdr >= m0->m_data) &&
445 		    ((caddr_t)hdr < m0->m_data + m0->m_len))
446 			break;
447 	if (m0 == NULL) {
448 		/* ick, tag info is stale */
449 		return (0);
450 	}
451 
452 	switch (af) {
453 	case AF_INET:
454 		if (flags & REDF_ECN4) {
455 			struct ip *ip = hdr;
456 			u_int8_t otos;
457 			int sum;
458 
459 			if (ip->ip_v != 4)
460 				return (0);	/* version mismatch! */
461 
462 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
463 				return (0);	/* not-ECT */
464 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
465 				return (1);	/* already marked */
466 
467 			/*
468 			 * ecn-capable but not marked,
469 			 * mark CE and update checksum
470 			 */
471 			otos = ip->ip_tos;
472 			ip->ip_tos |= IPTOS_ECN_CE;
473 			/*
474 			 * update checksum (from RFC1624)
475 			 *	   HC' = ~(~HC + ~m + m')
476 			 */
477 			sum = ~ntohs(ip->ip_sum) & 0xffff;
478 			sum += (~otos & 0xffff) + ip->ip_tos;
479 			sum = (sum >> 16) + (sum & 0xffff);
480 			sum += (sum >> 16);  /* add carry */
481 			ip->ip_sum = htons(~sum & 0xffff);
482 			return (1);
483 		}
484 		break;
485 #ifdef INET6
486 	case AF_INET6:
487 		if (flags & REDF_ECN6) {
488 			struct ip6_hdr *ip6 = hdr;
489 			u_int32_t flowlabel;
490 
491 			flowlabel = ntohl(ip6->ip6_flow);
492 			if ((flowlabel >> 28) != 6)
493 				return (0);	/* version mismatch! */
494 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
495 			    (IPTOS_ECN_NOTECT << 20))
496 				return (0);	/* not-ECT */
497 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
498 			    (IPTOS_ECN_CE << 20))
499 				return (1);	/* already marked */
500 			/*
501 			 * ecn-capable but not marked,  mark CE
502 			 */
503 			flowlabel |= (IPTOS_ECN_CE << 20);
504 			ip6->ip6_flow = htonl(flowlabel);
505 			return (1);
506 		}
507 		break;
508 #endif  /* INET6 */
509 	}
510 
511 	/* not marked */
512 	return (0);
513 }
514 
515 struct mbuf *
red_getq(rp,q)516 red_getq(rp, q)
517 	red_t *rp;
518 	class_queue_t *q;
519 {
520 	struct mbuf *m;
521 
522 	if ((m = _getq(q)) == NULL) {
523 		if (rp->red_idle == 0) {
524 			rp->red_idle = 1;
525 			microtime(&rp->red_last);
526 		}
527 		return NULL;
528 	}
529 
530 	rp->red_idle = 0;
531 	return (m);
532 }
533 
534 /*
535  * helper routine to calibrate avg during idle.
536  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
537  * here Wq = 1/weight and the code assumes Wq is close to zero.
538  *
539  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
540  */
541 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
542 
543 struct wtab *
wtab_alloc(int weight)544 wtab_alloc(int weight)
545 {
546 	struct wtab	*w;
547 	int		 i;
548 
549 	for (w = wtab_list; w != NULL; w = w->w_next)
550 		if (w->w_weight == weight) {
551 			w->w_refcount++;
552 			return (w);
553 		}
554 
555 	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
556 	if (w == NULL)
557 		panic("wtab_alloc: malloc failed!");
558 	bzero(w, sizeof(struct wtab));
559 	w->w_weight = weight;
560 	w->w_refcount = 1;
561 	w->w_next = wtab_list;
562 	wtab_list = w;
563 
564 	/* initialize the weight table */
565 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
566 	for (i = 1; i < 32; i++) {
567 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
568 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
569 			w->w_param_max = 1 << i;
570 	}
571 
572 	return (w);
573 }
574 
575 int
wtab_destroy(struct wtab * w)576 wtab_destroy(struct wtab *w)
577 {
578 	struct wtab	*prev;
579 
580 	if (--w->w_refcount > 0)
581 		return (0);
582 
583 	if (wtab_list == w)
584 		wtab_list = w->w_next;
585 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
586 		if (prev->w_next == w) {
587 			prev->w_next = w->w_next;
588 			break;
589 		}
590 
591 	FREE(w, M_DEVBUF);
592 	return (0);
593 }
594 
595 int32_t
pow_w(struct wtab * w,int n)596 pow_w(struct wtab *w, int n)
597 {
598 	int	i, bit;
599 	int32_t	val;
600 
601 	if (n >= w->w_param_max)
602 		return (0);
603 
604 	val = 1 << FP_SHIFT;
605 	if (n <= 0)
606 		return (val);
607 
608 	bit = 1;
609 	i = 0;
610 	while (n) {
611 		if (n & bit) {
612 			val = (val * w->w_tab[i]) >> FP_SHIFT;
613 			n &= ~bit;
614 		}
615 		i++;
616 		bit <<=  1;
617 	}
618 	return (val);
619 }
620