1 /*-
2  * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
3  * Copyright (c) 2008-2010, BitGravity Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  *  1. Redistributions of source code must retain the above copyright notice,
10  *     this list of conditions and the following disclaimer.
11  *
12  *  2. Neither the name of the BitGravity Corporation nor the names of its
13  *     contributors may be used to endorse or promote products derived from
14  *     this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "opt_route.h"
30 #include "opt_mpath.h"
31 #include "opt_ddb.h"
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD: stable/10/sys/net/flowtable.c 281955 2015-04-24 23:26:44Z hiren $");
37 
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/bitstring.h>
41 #include <sys/condvar.h>
42 #include <sys/callout.h>
43 #include <sys/hash.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/pcpu.h>
50 #include <sys/proc.h>
51 #include <sys/queue.h>
52 #include <sys/sbuf.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
55 #include <sys/socket.h>
56 #include <sys/syslog.h>
57 #include <sys/sysctl.h>
58 #include <vm/uma.h>
59 
60 #include <net/if.h>
61 #include <net/if_llatbl.h>
62 #include <net/if_var.h>
63 #include <net/route.h>
64 #include <net/flowtable.h>
65 #include <net/vnet.h>
66 
67 #include <netinet/in.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/in_var.h>
70 #include <netinet/if_ether.h>
71 #include <netinet/ip.h>
72 #ifdef INET6
73 #include <netinet/ip6.h>
74 #endif
75 #ifdef FLOWTABLE_HASH_ALL
76 #include <netinet/tcp.h>
77 #include <netinet/udp.h>
78 #include <netinet/sctp.h>
79 #endif
80 
81 #include <ddb/ddb.h>
82 
/*
 * Flow key geometry.  With FLOWTABLE_HASH_ALL the key carries two addresses
 * plus a pair of 16-bit ports; otherwise it carries a single address.
 */
#ifdef	FLOWTABLE_HASH_ALL
#define	KEY_PORTS	(sizeof(uint16_t) * 2)
#define	KEY_ADDRS	2
#else
#define	KEY_PORTS	0
#define	KEY_ADDRS	1
#endif

/* Size each address slot for the widest address family compiled in. */
#ifdef	INET6
#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
#else
#define	KEY_ADDR_LEN	sizeof(struct in_addr)
#endif

/* Key length in 32-bit words; only used to size flentry.f_key below. */
#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))

/*
 * One cached flow.  Entries are allocated from flow_zone and chained into a
 * bucket through f_next.  f_rt and f_lle are the route and link-layer entry
 * obtained in flowtable_insert(); flowtable_free_stale() releases them with
 * RTFREE()/LLE_FREE() when the entry is destroyed.
 */
struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define FL_STALE 		1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
#undef KEYLEN
114 
/* Head of one hash bucket: a singly-linked chain of flentry structures. */
SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));
118 
/*
 * A flow table instance (one per address family per vnet).  Buckets and
 * occupancy masks are per-CPU; ft_tmpmask is a single scratch bitstring that
 * the scanners (flowtable_free_stale(), flowtable_show()) copy a CPU's mask
 * into before walking it.
 */
struct flowtable {
	counter_u64_t	*ft_stat;	/* counters, indexed via FLOWSTAT_ADD() */
	int 		ft_size;	/* number of hash buckets */
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is per-cpu pointer itself.  Each instance points
	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t 	**ft_masks;
	bitstr_t	*ft_tmpmask;	/* scratch copy of one CPU's mask */
};
132 
/*
 * Add v to the counter that corresponds to field 'name' of
 * struct flowtable_stat; the field offset selects the slot in ft_stat.
 */
#define	FLOWSTAT_ADD(ft, name, v)	\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)

/* Cleaner kernel process, filled in through flow_kp. */
static struct proc *flowcleanerproc;
/* Seed passed to jenkins_hash32(); set from arc4random() in flowtable_init(). */
static uint32_t flow_hashjitter;

/*
 * Cleaner synchronisation, all protected by flowclean_lock:
 *  - flowclean_f_cv is broadcast by the cleaner after every pass and waited
 *    on by flowtable_flush();
 *  - flowclean_c_cv is what the cleaner sleeps on between passes and is
 *    broadcast by flowtable_flush() to trigger an immediate pass;
 *  - flowclean_cycles counts completed cleaner passes.
 */
static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
144 
145 /*
146  * TODO:
147  * - add sysctls to resize && flush flow tables
148  * - Add per flowtable sysctls for statistics and configuring timeouts
149  * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
152  * - add sysctl / device node / syscall to support exporting and importing
153  *   of flows with flag to indicate that a flow was imported so should
154  *   not be considered for auto-cleaning
155  * - support explicit connection state (currently only ad-hoc for DSR)
156  * - idetach() cleanup for options VIMAGE builds.
157  */
/* Per-vnet flow tables, one for each address family compiled in. */
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

/* UMA zone all flentry structures are allocated from (see flowtable_init()). */
static uma_zone_t flow_zone;

/* Per-vnet switch; flowtable_lookup() returns ENXIO while this is 0. */
static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable		VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

/* malloc(9) type for the bucket array and the occupancy bitstrings. */
static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

/* Defined after flowtable_insert(); needed by the per-family lookups. */
static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
183 
184 #ifdef INET
185 static struct flentry *
flowtable_lookup_ipv4(struct mbuf * m,struct route * ro)186 flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
187 {
188 	struct flentry *fle;
189 	struct sockaddr_in *sin;
190 	struct ip *ip;
191 	uint32_t fibnum;
192 #ifdef FLOWTABLE_HASH_ALL
193 	uint32_t key[3];
194 	int iphlen;
195 	uint16_t sport, dport;
196 	uint8_t proto;
197 #endif
198 
199 	ip = mtod(m, struct ip *);
200 
201 	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
202 	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
203 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
204 		return (NULL);
205 
206 	fibnum = M_GETFIB(m);
207 
208 #ifdef FLOWTABLE_HASH_ALL
209 	iphlen = ip->ip_hl << 2;
210 	proto = ip->ip_p;
211 
212 	switch (proto) {
213 	case IPPROTO_TCP: {
214 		struct tcphdr *th;
215 
216 		th = (struct tcphdr *)((char *)ip + iphlen);
217 		sport = th->th_sport;
218 		dport = th->th_dport;
219 		if (th->th_flags & (TH_RST|TH_FIN))
220 			fibnum |= (FL_STALE << 24);
221 		break;
222 	}
223 	case IPPROTO_UDP: {
224 		struct udphdr *uh;
225 
226 		uh = (struct udphdr *)((char *)ip + iphlen);
227 		sport = uh->uh_sport;
228 		dport = uh->uh_dport;
229 		break;
230 	}
231 	case IPPROTO_SCTP: {
232 		struct sctphdr *sh;
233 
234 		sh = (struct sctphdr *)((char *)ip + iphlen);
235 		sport = sh->src_port;
236 		dport = sh->dest_port;
237 		/* XXXGL: handle stale? */
238 		break;
239 	}
240 	default:
241 		sport = dport = 0;
242 		break;
243 	}
244 
245 	key[0] = ip->ip_dst.s_addr;
246 	key[1] = ip->ip_src.s_addr;
247 	key[2] = (dport << 16) | sport;
248 	fibnum |= proto << 16;
249 
250 	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
251 	    fibnum);
252 
253 #else	/* !FLOWTABLE_HASH_ALL */
254 
255 	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
256 	    sizeof(struct in_addr), fibnum);
257 
258 #endif	/* FLOWTABLE_HASH_ALL */
259 
260 	if (fle == NULL)
261 		return (NULL);
262 
263 	sin = (struct sockaddr_in *)&ro->ro_dst;
264 	sin->sin_family = AF_INET;
265 	sin->sin_len = sizeof(*sin);
266 	sin->sin_addr = ip->ip_dst;
267 
268 	return (fle);
269 }
270 #endif /* INET */
271 
272 #ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that the first mbuf of the chain 'm' (taken
 * from the enclosing scope) holds at least len + sizeof(T) bytes and then
 * sets p to point at offset "len" in it.  No data is actually pulled up:
 * when the mbuf is too short the macro returns NULL from the enclosing
 * function.
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* Typed views of the upper-layer header pointer. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
290 
291 static struct flentry *
flowtable_lookup_ipv6(struct mbuf * m,struct route * ro)292 flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
293 {
294 	struct flentry *fle;
295 	struct sockaddr_in6 *sin6;
296 	struct ip6_hdr *ip6;
297 	uint32_t fibnum;
298 #ifdef FLOWTABLE_HASH_ALL
299 	uint32_t key[9];
300 	void *ulp;
301 	int hlen;
302 	uint16_t sport, dport;
303 	u_short offset;
304 	uint8_t proto;
305 #else
306 	uint32_t key[4];
307 #endif
308 
309 	ip6 = mtod(m, struct ip6_hdr *);
310 	if (in6_localaddr(&ip6->ip6_dst))
311 		return (NULL);
312 
313 	fibnum = M_GETFIB(m);
314 
315 #ifdef	FLOWTABLE_HASH_ALL
316 	hlen = sizeof(struct ip6_hdr);
317 	proto = ip6->ip6_nxt;
318 	offset = sport = dport = 0;
319 	ulp = NULL;
320 	while (ulp == NULL) {
321 		switch (proto) {
322 		case IPPROTO_ICMPV6:
323 		case IPPROTO_OSPFIGP:
324 		case IPPROTO_PIM:
325 		case IPPROTO_CARP:
326 		case IPPROTO_ESP:
327 		case IPPROTO_NONE:
328 			ulp = ip6;
329 			break;
330 		case IPPROTO_TCP:
331 			PULLUP_TO(hlen, ulp, struct tcphdr);
332 			dport = TCP(ulp)->th_dport;
333 			sport = TCP(ulp)->th_sport;
334 			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
335 				fibnum |= (FL_STALE << 24);
336 			break;
337 		case IPPROTO_SCTP:
338 			PULLUP_TO(hlen, ulp, struct sctphdr);
339 			dport = SCTP(ulp)->src_port;
340 			sport = SCTP(ulp)->dest_port;
341 			/* XXXGL: handle stale? */
342 			break;
343 		case IPPROTO_UDP:
344 			PULLUP_TO(hlen, ulp, struct udphdr);
345 			dport = UDP(ulp)->uh_dport;
346 			sport = UDP(ulp)->uh_sport;
347 			break;
348 		case IPPROTO_HOPOPTS:	/* RFC 2460 */
349 			PULLUP_TO(hlen, ulp, struct ip6_hbh);
350 			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
351 			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
352 			ulp = NULL;
353 			break;
354 		case IPPROTO_ROUTING:	/* RFC 2460 */
355 			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
356 			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
357 			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
358 			ulp = NULL;
359 			break;
360 		case IPPROTO_FRAGMENT:	/* RFC 2460 */
361 			PULLUP_TO(hlen, ulp, struct ip6_frag);
362 			hlen += sizeof (struct ip6_frag);
363 			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
364 			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
365 			    IP6F_OFF_MASK;
366 			ulp = NULL;
367 			break;
368 		case IPPROTO_DSTOPTS:	/* RFC 2460 */
369 			PULLUP_TO(hlen, ulp, struct ip6_hbh);
370 			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
371 			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
372 			ulp = NULL;
373 			break;
374 		case IPPROTO_AH:	/* RFC 2402 */
375 			PULLUP_TO(hlen, ulp, struct ip6_ext);
376 			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
377 			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
378 			ulp = NULL;
379 			break;
380 		default:
381 			PULLUP_TO(hlen, ulp, struct ip6_ext);
382 			break;
383 		}
384 	}
385 
386 	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
387 	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
388 	key[8] = (dport << 16) | sport;
389 	fibnum |= proto << 16;
390 
391 	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
392 	    fibnum);
393 #else	/* !FLOWTABLE_HASH_ALL */
394 	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
395 	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
396 	    fibnum);
397 #endif	/* FLOWTABLE_HASH_ALL */
398 
399 	if (fle == NULL)
400 		return (NULL);
401 
402 	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
403 	sin6->sin6_family = AF_INET6;
404 	sin6->sin6_len = sizeof(*sin6);
405 	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
406 
407 	return (fle);
408 }
409 #endif /* INET6 */
410 
411 static bitstr_t *
flowtable_mask(struct flowtable * ft)412 flowtable_mask(struct flowtable *ft)
413 {
414 
415 	/*
416 	 * flowtable_free_stale() calls w/o critical section, but
417 	 * with sched_bind(). Since pointer is stable throughout
418 	 * ft lifetime, it is safe, otherwise...
419 	 *
420 	 * CRITICAL_ASSERT(curthread);
421 	 */
422 
423 	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
424 }
425 
426 static struct flist *
flowtable_list(struct flowtable * ft,uint32_t hash)427 flowtable_list(struct flowtable *ft, uint32_t hash)
428 {
429 
430 	CRITICAL_ASSERT(curthread);
431 	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
432 }
433 
434 static int
flow_stale(struct flowtable * ft,struct flentry * fle,int maxidle)435 flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
436 {
437 
438 	if (((fle->f_rt->rt_flags & RTF_HOST) &&
439 	    ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
440 	    (fle->f_rt->rt_ifp == NULL) ||
441 	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
442 	    (fle->f_lle->la_flags & LLE_VALID) == 0)
443 		return (1);
444 
445 	if (time_uptime - fle->f_uptime > maxidle)
446 		return (1);
447 
448 #ifdef FLOWTABLE_HASH_ALL
449 	if (fle->f_flags & FL_STALE)
450 		return (1);
451 #endif
452 
453 	return (0);
454 }
455 
456 static int
flow_full(void)457 flow_full(void)
458 {
459 	int count, max;
460 
461 	count = uma_zone_get_cur(flow_zone);
462 	max = uma_zone_get_max(flow_zone);
463 
464 	return (count > (max - (max >> 3)));
465 }
466 
467 static int
flow_matches(struct flentry * fle,uint32_t * key,int keylen,uint32_t fibnum)468 flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
469 {
470 #ifdef FLOWTABLE_HASH_ALL
471 	uint8_t proto;
472 
473 	proto = (fibnum >> 16) & 0xff;
474 	fibnum &= 0xffff;
475 #endif
476 
477 	CRITICAL_ASSERT(curthread);
478 
479 	/* Microoptimization for IPv4: don't use bcmp(). */
480 	if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
481 	    (bcmp(fle->f_key, key, keylen) == 0)) &&
482 	    fibnum == fle->f_fibnum &&
483 #ifdef FLOWTABLE_HASH_ALL
484 	    proto == fle->f_proto &&
485 #endif
486 	    (fle->f_rt->rt_flags & RTF_UP) &&
487 	    fle->f_rt->rt_ifp != NULL &&
488 	    (fle->f_lle->la_flags & LLE_VALID))
489 		return (1);
490 
491 	return (0);
492 }
493 
494 static struct flentry *
flowtable_insert(struct flowtable * ft,uint32_t hash,uint32_t * key,int keylen,uint32_t fibnum0)495 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
496     int keylen, uint32_t fibnum0)
497 {
498 #ifdef INET6
499 	struct route_in6 sro6;
500 #endif
501 #ifdef INET
502 	struct route sro;
503 #endif
504 	struct route *ro = NULL;
505 	struct rtentry *rt;
506 	struct lltable *lt = NULL;
507 	struct llentry *lle;
508 	struct sockaddr_storage *l3addr;
509 	struct ifnet *ifp;
510 	struct flist *flist;
511 	struct flentry *fle, *iter;
512 	bitstr_t *mask;
513 	uint16_t fibnum = fibnum0;
514 #ifdef FLOWTABLE_HASH_ALL
515 	uint8_t proto;
516 
517 	proto = (fibnum0 >> 16) & 0xff;
518 	fibnum = fibnum0 & 0xffff;
519 #endif
520 
521 	/*
522 	 * This bit of code ends up locking the
523 	 * same route 3 times (just like ip_output + ether_output)
524 	 * - at lookup
525 	 * - in rt_check when called by arpresolve
526 	 * - dropping the refcount for the rtentry
527 	 *
528 	 * This could be consolidated to one if we wrote a variant
529 	 * of arpresolve with an rt_check variant that expected to
530 	 * receive the route locked
531 	 */
532 #ifdef INET
533 	if (ft == &V_ip4_ft) {
534 		struct sockaddr_in *sin;
535 
536 		ro = &sro;
537 		bzero(&sro.ro_dst, sizeof(sro.ro_dst));
538 
539 		sin = (struct sockaddr_in *)&sro.ro_dst;
540 		sin->sin_family = AF_INET;
541 		sin->sin_len = sizeof(*sin);
542 		sin->sin_addr.s_addr = key[0];
543 	}
544 #endif
545 #ifdef INET6
546 	if (ft == &V_ip6_ft) {
547 		struct sockaddr_in6 *sin6;
548 
549 		ro = (struct route *)&sro6;
550 		sin6 = &sro6.ro_dst;
551 
552 		bzero(sin6, sizeof(*sin6));
553 		sin6->sin6_family = AF_INET6;
554 		sin6->sin6_len = sizeof(*sin6);
555 		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
556 	}
557 #endif
558 
559 	ro->ro_rt = NULL;
560 #ifdef RADIX_MPATH
561 	rtalloc_mpath_fib(ro, hash, fibnum);
562 #else
563 	rtalloc_ign_fib(ro, 0, fibnum);
564 #endif
565 	if (ro->ro_rt == NULL)
566 		return (NULL);
567 
568 	rt = ro->ro_rt;
569 	ifp = rt->rt_ifp;
570 
571 	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
572 		RTFREE(rt);
573 		return (NULL);
574 	}
575 
576 #ifdef INET
577 	if (ft == &V_ip4_ft)
578 		lt = LLTABLE(ifp);
579 #endif
580 #ifdef INET6
581 	if (ft == &V_ip6_ft)
582 		lt = LLTABLE6(ifp);
583 #endif
584 
585 	if (rt->rt_flags & RTF_GATEWAY)
586 		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
587 	else
588 		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
589 	lle = llentry_alloc(ifp, lt, l3addr);
590 
591 	if (lle == NULL) {
592 		RTFREE(rt);
593 		return (NULL);
594 	}
595 
596 	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
597 	if ((lle->la_flags & LLE_VALID) == 0) {
598 		RTFREE(rt);
599 		LLE_FREE(lle);
600 		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
601 		return (NULL);
602 	}
603 
604 	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
605 	if (fle == NULL) {
606 		RTFREE(rt);
607 		LLE_FREE(lle);
608 		return (NULL);
609 	}
610 
611 	fle->f_hash = hash;
612 	bcopy(key, &fle->f_key, keylen);
613 	fle->f_rt = rt;
614 	fle->f_lle = lle;
615 	fle->f_fibnum = fibnum;
616 	fle->f_uptime = time_uptime;
617 #ifdef FLOWTABLE_HASH_ALL
618 	fle->f_proto = proto;
619 	fle->f_flags = fibnum0 >> 24;
620 #endif
621 
622 	critical_enter();
623 	mask = flowtable_mask(ft);
624 	flist = flowtable_list(ft, hash);
625 
626 	if (SLIST_EMPTY(flist)) {
627 		bit_set(mask, (hash % ft->ft_size));
628 		SLIST_INSERT_HEAD(flist, fle, f_next);
629 		goto skip;
630 	}
631 
632 	/*
633 	 * find end of list and make sure that we were not
634 	 * preempted by another thread handling this flow
635 	 */
636 	SLIST_FOREACH(iter, flist, f_next) {
637 		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
638 		    ("%s: wrong hash", __func__));
639 		if (flow_matches(iter, key, keylen, fibnum)) {
640 			/*
641 			 * We probably migrated to an other CPU after
642 			 * lookup in flowtable_lookup_common() failed.
643 			 * It appeared that this CPU already has flow
644 			 * entry.
645 			 */
646 			iter->f_uptime = time_uptime;
647 #ifdef FLOWTABLE_HASH_ALL
648 			iter->f_flags |= fibnum >> 24;
649 #endif
650 			critical_exit();
651 			FLOWSTAT_INC(ft, ft_collisions);
652 			uma_zfree(flow_zone, fle);
653 			return (iter);
654 		}
655 	}
656 
657 	SLIST_INSERT_HEAD(flist, fle, f_next);
658 skip:
659 	critical_exit();
660 	FLOWSTAT_INC(ft, ft_inserts);
661 
662 	return (fle);
663 }
664 
665 int
flowtable_lookup(sa_family_t sa,struct mbuf * m,struct route * ro)666 flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
667 {
668 	struct flentry *fle;
669 
670 	if (V_flowtable_enable == 0)
671 		return (ENXIO);
672 
673 	switch (sa) {
674 #ifdef INET
675 	case AF_INET:
676 		fle = flowtable_lookup_ipv4(m, ro);
677 		break;
678 #endif
679 #ifdef INET6
680 	case AF_INET6:
681 		fle = flowtable_lookup_ipv6(m, ro);
682 		break;
683 #endif
684 	default:
685 		panic("%s: sa %d", __func__, sa);
686 	}
687 
688 	if (fle == NULL)
689 		return (EHOSTUNREACH);
690 
691 	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
692 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
693 		m->m_pkthdr.flowid = fle->f_hash;
694 	}
695 
696 	ro->ro_rt = fle->f_rt;
697 	ro->ro_lle = fle->f_lle;
698 	ro->ro_flags |= RT_NORTREF;
699 
700 	return (0);
701 }
702 
703 static struct flentry *
flowtable_lookup_common(struct flowtable * ft,uint32_t * key,int keylen,uint32_t fibnum)704 flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
705     uint32_t fibnum)
706 {
707 	struct flist *flist;
708 	struct flentry *fle;
709 	uint32_t hash;
710 
711 	FLOWSTAT_INC(ft, ft_lookups);
712 
713 	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
714 
715 	critical_enter();
716 	flist = flowtable_list(ft, hash);
717 	SLIST_FOREACH(fle, flist, f_next) {
718 		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
719 		    ("%s: wrong hash", __func__));
720 		if (flow_matches(fle, key, keylen, fibnum)) {
721 			fle->f_uptime = time_uptime;
722 #ifdef FLOWTABLE_HASH_ALL
723 			fle->f_flags |= fibnum >> 24;
724 #endif
725 			critical_exit();
726 			FLOWSTAT_INC(ft, ft_hits);
727 			return (fle);
728 		}
729 	}
730 	critical_exit();
731 
732 	FLOWSTAT_INC(ft, ft_misses);
733 
734 	return (flowtable_insert(ft, hash, key, keylen, fibnum));
735 }
736 
737 /*
738  * used by the bit_alloc macro
739  */
740 #define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
741 static void
flowtable_alloc(struct flowtable * ft)742 flowtable_alloc(struct flowtable *ft)
743 {
744 
745 	ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
746 	    M_FTABLE, M_WAITOK);
747 	for (int i = 0; i < ft->ft_size; i++)
748 		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
749 
750 	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
751 	for (int i = 0; i < mp_ncpus; i++) {
752 		bitstr_t **b;
753 
754 		b = zpcpu_get_cpu(ft->ft_masks, i);
755 		*b = bit_alloc(ft->ft_size);
756 	}
757 	ft->ft_tmpmask = bit_alloc(ft->ft_size);
758 }
759 #undef calloc
760 
761 static void
flowtable_free_stale(struct flowtable * ft,struct rtentry * rt,int maxidle)762 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
763 {
764 	struct flist *flist, freelist;
765 	struct flentry *fle, *fle1, *fleprev;
766 	bitstr_t *mask, *tmpmask;
767 	int curbit, tmpsize;
768 
769 	SLIST_INIT(&freelist);
770 	mask = flowtable_mask(ft);
771 	tmpmask = ft->ft_tmpmask;
772 	tmpsize = ft->ft_size;
773 	memcpy(tmpmask, mask, ft->ft_size/8);
774 	curbit = 0;
775 	fleprev = NULL; /* pacify gcc */
776 	/*
777 	 * XXX Note to self, bit_ffs operates at the byte level
778 	 * and thus adds gratuitous overhead
779 	 */
780 	bit_ffs(tmpmask, ft->ft_size, &curbit);
781 	while (curbit != -1) {
782 		if (curbit >= ft->ft_size || curbit < -1) {
783 			log(LOG_ALERT,
784 			    "warning: bad curbit value %d \n",
785 			    curbit);
786 			break;
787 		}
788 
789 		FLOWSTAT_INC(ft, ft_free_checks);
790 
791 		critical_enter();
792 		flist = flowtable_list(ft, curbit);
793 #ifdef DIAGNOSTIC
794 		if (SLIST_EMPTY(flist) && curbit > 0) {
795 			log(LOG_ALERT,
796 			    "warning bit=%d set, but no fle found\n",
797 			    curbit);
798 		}
799 #endif
800 		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
801 			if (rt != NULL && fle->f_rt != rt) {
802 				fleprev = fle;
803 				continue;
804 			}
805 			if (!flow_stale(ft, fle, maxidle)) {
806 				fleprev = fle;
807 				continue;
808 			}
809 
810 			if (fle == SLIST_FIRST(flist))
811 				SLIST_REMOVE_HEAD(flist, f_next);
812 			else
813 				SLIST_REMOVE_AFTER(fleprev, f_next);
814 			SLIST_INSERT_HEAD(&freelist, fle, f_next);
815 		}
816 		if (SLIST_EMPTY(flist))
817 			bit_clear(mask, curbit);
818 		critical_exit();
819 
820 		bit_clear(tmpmask, curbit);
821 		tmpmask += (curbit / 8);
822 		tmpsize -= (curbit / 8) * 8;
823 		bit_ffs(tmpmask, tmpsize, &curbit);
824 	}
825 
826 	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
827 		FLOWSTAT_INC(ft, ft_frees);
828 		if (fle->f_rt != NULL)
829 			RTFREE(fle->f_rt);
830 		if (fle->f_lle != NULL)
831 			LLE_FREE(fle->f_lle);
832 		uma_zfree(flow_zone, fle);
833 	}
834 }
835 
/*
 * Run flowtable_free_stale() for every CPU.  flowtable_free_stale() reaches
 * its data through the current-CPU accessors, so once SMP is started the
 * calling thread is bound to each CPU in turn for the duration of the scan.
 * rt and maxidle are passed through unchanged.
 */
static void
flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	int i;

	CPU_FOREACH(i) {
		/* Move to CPU i so that its buckets and mask get scanned. */
		if (smp_started == 1) {
			thread_lock(curthread);
			sched_bind(curthread, i);
			thread_unlock(curthread);
		}

		flowtable_free_stale(ft, rt, maxidle);

		if (smp_started == 1) {
			thread_lock(curthread);
			sched_unbind(curthread);
			thread_unlock(curthread);
		}
	}
}
857 
858 void
flowtable_route_flush(sa_family_t sa,struct rtentry * rt)859 flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
860 {
861 	struct flowtable *ft;
862 
863 	switch (sa) {
864 #ifdef INET
865 	case AF_INET:
866 		ft = &V_ip4_ft;
867 		break;
868 #endif
869 #ifdef INET6
870 	case AF_INET6:
871 		ft = &V_ip6_ft;
872 		break;
873 #endif
874 	default:
875 		panic("%s: sa %d", __func__, sa);
876 	}
877 
878 	flowtable_clean_vnet(ft, rt, 0);
879 }
880 
881 static void
flowtable_cleaner(void)882 flowtable_cleaner(void)
883 {
884 	VNET_ITERATOR_DECL(vnet_iter);
885 	struct thread *td;
886 
887 	if (bootverbose)
888 		log(LOG_INFO, "flowtable cleaner started\n");
889 	td = curthread;
890 	while (1) {
891 		uint32_t flowclean_freq, maxidle;
892 
893 		/*
894 		 * The maximum idle time, as well as frequency are arbitrary.
895 		 */
896 		if (flow_full())
897 			maxidle = 5;
898 		else
899 			maxidle = 30;
900 
901 		VNET_LIST_RLOCK();
902 		VNET_FOREACH(vnet_iter) {
903 			CURVNET_SET(vnet_iter);
904 #ifdef INET
905 			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
906 #endif
907 #ifdef INET6
908 			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
909 #endif
910 			CURVNET_RESTORE();
911 		}
912 		VNET_LIST_RUNLOCK();
913 
914 		if (flow_full())
915 			flowclean_freq = 4*hz;
916 		else
917 			flowclean_freq = 20*hz;
918 		mtx_lock(&flowclean_lock);
919 		thread_lock(td);
920 		sched_prio(td, PPAUSE);
921 		thread_unlock(td);
922 		flowclean_cycles++;
923 		cv_broadcast(&flowclean_f_cv);
924 		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
925 		mtx_unlock(&flowclean_lock);
926 	}
927 }
928 
929 static void
flowtable_flush(void * unused __unused)930 flowtable_flush(void *unused __unused)
931 {
932 	uint64_t start;
933 
934 	mtx_lock(&flowclean_lock);
935 	start = flowclean_cycles;
936 	while (start == flowclean_cycles) {
937 		cv_broadcast(&flowclean_c_cv);
938 		cv_wait(&flowclean_f_cv, &flowclean_lock);
939 	}
940 	mtx_unlock(&flowclean_lock);
941 }
942 
/*
 * Descriptor handed to kproc_start(): process name, entry point, and where
 * to store the resulting struct proc pointer.
 */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
949 
950 static int
flowtable_get_size(char * name)951 flowtable_get_size(char *name)
952 {
953 	int size;
954 
955 	if (TUNABLE_INT_FETCH(name, &size)) {
956 		if (size < 256)
957 			size = 256;
958 		if (!powerof2(size)) {
959 			printf("%s must be power of 2\n", name);
960 			size = 2048;
961 		}
962 	} else {
963 		/*
964 		 * round up to the next power of 2
965 		 */
966 		size = 1 << fls((1024 + maxusers * 64) - 1);
967 	}
968 
969 	return (size);
970 }
971 
/*
 * Global (not per-vnet) initialisation: pick the hash seed, create the
 * flentry zone and set its limit, set up the cleaner's condition variables
 * and mutex, and register flowtable_flush() for interface departures.
 */
static void
flowtable_init(const void *unused __unused)
{

	/* Random seed for jenkins_hash32() in flowtable_lookup_common(). */
	flow_hashjitter = arc4random();

	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
    flowtable_init, NULL);
990 
991 #ifdef INET
/* net.flowtable.ip4 sysctl subtree and the IPv4 per-vnet statistics. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");
1001 
1002 static void
flowtable_init_vnet_v4(const void * unused __unused)1003 flowtable_init_vnet_v4(const void *unused __unused)
1004 {
1005 
1006 	V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
1007 	V_ip4_ft.ft_stat = VNET(ip4_ftstat);
1008 	flowtable_alloc(&V_ip4_ft);
1009 }
1010 VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
1011     flowtable_init_vnet_v4, NULL);
1012 #endif /* INET */
1013 
1014 #ifdef INET6
/* net.flowtable.ip6 sysctl subtree and the IPv6 per-vnet statistics. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");
1024 
1025 static void
flowtable_init_vnet_v6(const void * unused __unused)1026 flowtable_init_vnet_v6(const void *unused __unused)
1027 {
1028 
1029 	V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
1030 	V_ip6_ft.ft_stat = VNET(ip6_ftstat);
1031 	flowtable_alloc(&V_ip6_ft);
1032 }
1033 VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
1034     flowtable_init_vnet_v6, NULL);
1035 #endif /* INET6 */
1036 
1037 #ifdef DDB
1038 static bitstr_t *
flowtable_mask_pcpu(struct flowtable * ft,int cpuid)1039 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1040 {
1041 
1042 	return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
1043 }
1044 
1045 static struct flist *
flowtable_list_pcpu(struct flowtable * ft,uint32_t hash,int cpuid)1046 flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1047 {
1048 
1049 	return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
1050 }
1051 
1052 static void
flow_show(struct flowtable * ft,struct flentry * fle)1053 flow_show(struct flowtable *ft, struct flentry *fle)
1054 {
1055 	int idle_time;
1056 	int rt_valid, ifp_valid;
1057 	volatile struct rtentry *rt;
1058 	struct ifnet *ifp = NULL;
1059 	uint32_t *hashkey = fle->f_key;
1060 
1061 	idle_time = (int)(time_uptime - fle->f_uptime);
1062 	rt = fle->f_rt;
1063 	rt_valid = rt != NULL;
1064 	if (rt_valid)
1065 		ifp = rt->rt_ifp;
1066 	ifp_valid = ifp != NULL;
1067 
1068 #ifdef INET
1069 	if (ft == &V_ip4_ft) {
1070 		char daddr[4*sizeof "123"];
1071 #ifdef FLOWTABLE_HASH_ALL
1072 		char saddr[4*sizeof "123"];
1073 		uint16_t sport, dport;
1074 #endif
1075 
1076 		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
1077 #ifdef FLOWTABLE_HASH_ALL
1078 		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1079 		dport = ntohs((uint16_t)(hashkey[2] >> 16));
1080 		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
1081 		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
1082 #else
1083 		db_printf("%s ", daddr);
1084 #endif
1085 	}
1086 #endif /* INET */
1087 #ifdef INET6
1088 	if (ft == &V_ip6_ft) {
1089 #ifdef FLOWTABLE_HASH_ALL
1090 		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1091 		    hashkey[0], hashkey[1], hashkey[2],
1092 		    hashkey[3], hashkey[4], hashkey[5],
1093 		    hashkey[6], hashkey[7], hashkey[8]);
1094 #else
1095 		db_printf("\n\tkey=%08x:%08x:%08x ",
1096 		    hashkey[0], hashkey[1], hashkey[2]);
1097 #endif
1098 	}
1099 #endif /* INET6 */
1100 
1101 	db_printf("hash=%08x idle_time=%03d"
1102 	    "\n\tfibnum=%02d rt=%p",
1103 	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
1104 
1105 #ifdef FLOWTABLE_HASH_ALL
1106 	if (fle->f_flags & FL_STALE)
1107 		db_printf(" FL_STALE ");
1108 #endif
1109 	if (rt_valid) {
1110 		if (rt->rt_flags & RTF_UP)
1111 			db_printf(" RTF_UP ");
1112 	}
1113 	if (ifp_valid) {
1114 		if (ifp->if_flags & IFF_LOOPBACK)
1115 			db_printf(" IFF_LOOPBACK ");
1116 		if (ifp->if_flags & IFF_UP)
1117 			db_printf(" IFF_UP ");
1118 		if (ifp->if_flags & IFF_POINTOPOINT)
1119 			db_printf(" IFF_POINTOPOINT ");
1120 	}
1121 	db_printf("\n");
1122 }
1123 
1124 static void
flowtable_show(struct flowtable * ft,int cpuid)1125 flowtable_show(struct flowtable *ft, int cpuid)
1126 {
1127 	int curbit = 0;
1128 	bitstr_t *mask, *tmpmask;
1129 
1130 	if (cpuid != -1)
1131 		db_printf("cpu: %d\n", cpuid);
1132 	mask = flowtable_mask_pcpu(ft, cpuid);
1133 	tmpmask = ft->ft_tmpmask;
1134 	memcpy(tmpmask, mask, ft->ft_size/8);
1135 	/*
1136 	 * XXX Note to self, bit_ffs operates at the byte level
1137 	 * and thus adds gratuitous overhead
1138 	 */
1139 	bit_ffs(tmpmask, ft->ft_size, &curbit);
1140 	while (curbit != -1) {
1141 		struct flist *flist;
1142 		struct flentry *fle;
1143 
1144 		if (curbit >= ft->ft_size || curbit < -1) {
1145 			db_printf("warning: bad curbit value %d \n",
1146 			    curbit);
1147 			break;
1148 		}
1149 
1150 		flist = flowtable_list_pcpu(ft, curbit, cpuid);
1151 
1152 		SLIST_FOREACH(fle, flist, f_next)
1153 			flow_show(ft, fle);
1154 		bit_clear(tmpmask, curbit);
1155 		bit_ffs(tmpmask, ft->ft_size, &curbit);
1156 	}
1157 }
1158 
1159 static void
flowtable_show_vnet(struct flowtable * ft)1160 flowtable_show_vnet(struct flowtable *ft)
1161 {
1162 
1163 	int i;
1164 
1165 	CPU_FOREACH(i)
1166 		flowtable_show(ft, i);
1167 }
1168 
DB_SHOW_COMMAND(flowtables,db_show_flowtables)1169 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1170 {
1171 	VNET_ITERATOR_DECL(vnet_iter);
1172 
1173 	VNET_FOREACH(vnet_iter) {
1174 		CURVNET_SET(vnet_iter);
1175 #ifdef VIMAGE
1176 		db_printf("vnet %p\n", vnet_iter);
1177 #endif
1178 #ifdef INET
1179 		printf("IPv4:\n");
1180 		flowtable_show_vnet(&V_ip4_ft);
1181 #endif
1182 #ifdef INET6
1183 		printf("IPv6:\n");
1184 		flowtable_show_vnet(&V_ip6_ft);
1185 #endif
1186 		CURVNET_RESTORE();
1187 	}
1188 }
1189 #endif
1190