1 /*-
2  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3  * All rights reserved.
4  *
5  * This software was developed by Robert N. M. Watson under contract
6  * to Juniper Networks, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 
32 __FBSDID("$FreeBSD: stable/10/sys/netinet/in_pcbgroup.c 222748 2011-06-06 12:55:02Z rwatson $");
33 
34 #include "opt_inet6.h"
35 
36 #include <sys/param.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/mutex.h>
41 #include <sys/smp.h>
42 #include <sys/socketvar.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/in_pcb.h>
46 #ifdef INET6
47 #include <netinet6/in6_pcb.h>
48 #endif /* INET6 */
49 
50 /*
51  * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
52  * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
53  * Strategies in Modern Operating Systems".  This implementation differs
54  * significantly from that described in the paper, in that it attempts to
55  * introduce not just notions of affinity for connections and distribute work
56  * so as to reduce lock contention, but also align those notions with
57  * hardware work distribution strategies such as RSS.  In this construction,
58  * connection groups supplement, rather than replace, existing reservation
59  * tables for protocol 4-tuples, offering CPU-affine lookup tables with
60  * minimal cache line migration and lock contention during steady state
61  * operation.
62  *
63  * Internet protocols, such as UDP and TCP, register to use connection groups
64  * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
65  * indicates to the connection group code whether a 2-tuple or 4-tuple is
66  * used as an argument to hashes that assign a connection to a particular
67  * group.  This must be aligned with any hardware offloaded distribution
68  * model, such as RSS or similar approaches taken in embedded network boards.
69  * Wildcard sockets require special handling, as in Willman 2006, and are
70  * shared between connection groups -- while being protected by group-local
71  * locks.  This means that connection establishment and teardown can be
72  * significantly more expensive than without connection groups, but that
73  * steady-state processing can be significantly faster.
74  *
75  * Most of the implementation of connection groups is in this file; however,
76  * connection group lookup is implemented in in_pcb.c alongside reservation
77  * table lookups -- see in_pcblookup_group().
78  *
79  * TODO:
80  *
81  * Implement dynamic rebalancing of buckets with connection groups; when
82  * load is unevenly distributed, search for more optimal balancing on
83  * demand.  This might require scaling up the number of connection groups
84  * by <<1.
85  *
86  * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
87  * groups for ip_input and ip6_input, allowing non-offloaded work
88  * distribution.
89  *
90  * Expose effective CPU affinity of connections to userspace using socket
91  * options.
92  *
93  * Investigate per-connection affinity overrides based on socket options; an
94  * option could be set, certainly resulting in work being distributed
95  * differently in software, and possibly propagated to supporting hardware
96  * with TCAMs or hardware hash tables.  This might require connections to
97  * exist in more than one connection group at a time.
98  *
99  * Hook netisr thread reconfiguration events, and propagate those to RSS so
100  * that rebalancing can occur when the thread pool grows or shrinks.
101  *
102  * Expose per-pcbgroup statistics to userspace monitoring tools such as
103  * netstat, in order to allow better debugging and profiling.
104  */
105 
106 void
in_pcbgroup_init(struct inpcbinfo * pcbinfo,u_int hashfields,int hash_nelements)107 in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
108     int hash_nelements)
109 {
110 	struct inpcbgroup *pcbgroup;
111 	u_int numpcbgroups, pgn;
112 
113 	/*
114 	 * Only enable connection groups for a protocol if it has been
115 	 * specifically requested.
116 	 */
117 	if (hashfields == IPI_HASHFIELDS_NONE)
118 		return;
119 
120 	/*
121 	 * Connection groups are about multi-processor load distribution,
122 	 * lock contention, and connection CPU affinity.  As such, no point
123 	 * in turning them on for a uniprocessor machine, it only wastes
124 	 * memory.
125 	 */
126 	if (mp_ncpus == 1)
127 		return;
128 
129 	/*
130 	 * Use one group per CPU for now.  If we decide to do dynamic
131 	 * rebalancing a la RSS, we'll need to shift left by at least 1.
132 	 */
133 	numpcbgroups = mp_ncpus;
134 
135 	pcbinfo->ipi_hashfields = hashfields;
136 	pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
137 	    sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
138 	pcbinfo->ipi_npcbgroups = numpcbgroups;
139 	pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
140 	    &pcbinfo->ipi_wildmask);
141 	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
142 		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
143 		pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
144 		    &pcbgroup->ipg_hashmask);
145 		INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
146 
147 		/*
148 		 * Initialise notional affinity of the pcbgroup -- for RSS,
149 		 * we want the same notion of affinity as NICs to be used.
150 		 * Just round robin for the time being.
151 		 */
152 		pcbgroup->ipg_cpu = (pgn % mp_ncpus);
153 	}
154 }
155 
156 void
in_pcbgroup_destroy(struct inpcbinfo * pcbinfo)157 in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
158 {
159 	struct inpcbgroup *pcbgroup;
160 	u_int pgn;
161 
162 	if (pcbinfo->ipi_npcbgroups == 0)
163 		return;
164 
165 	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
166 		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
167 		KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
168 		    ("in_pcbinfo_destroy: listhead not empty"));
169 		INP_GROUP_LOCK_DESTROY(pcbgroup);
170 		hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
171 		    pcbgroup->ipg_hashmask);
172 	}
173 	hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
174 	free(pcbinfo->ipi_pcbgroups, M_PCB);
175 	pcbinfo->ipi_pcbgroups = NULL;
176 	pcbinfo->ipi_npcbgroups = 0;
177 	pcbinfo->ipi_hashfields = 0;
178 }
179 
180 /*
181  * Given a hash of whatever the covered tuple might be, return a pcbgroup
182  * index.
183  */
184 static __inline u_int
in_pcbgroup_getbucket(struct inpcbinfo * pcbinfo,uint32_t hash)185 in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
186 {
187 
188 	return (hash % pcbinfo->ipi_npcbgroups);
189 }
190 
191 /*
192  * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
193  * information is insufficient to identify the pcbgroup.
194  */
195 struct inpcbgroup *
in_pcbgroup_byhash(struct inpcbinfo * pcbinfo,u_int hashtype,uint32_t hash)196 in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
197 {
198 
199 	return (NULL);
200 }
201 
202 static struct inpcbgroup *
in_pcbgroup_bymbuf(struct inpcbinfo * pcbinfo,struct mbuf * m)203 in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
204 {
205 
206 	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
207 	    m->m_pkthdr.flowid));
208 }
209 
210 struct inpcbgroup *
in_pcbgroup_bytuple(struct inpcbinfo * pcbinfo,struct in_addr laddr,u_short lport,struct in_addr faddr,u_short fport)211 in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
212     u_short lport, struct in_addr faddr, u_short fport)
213 {
214 	uint32_t hash;
215 
216 	switch (pcbinfo->ipi_hashfields) {
217 	case IPI_HASHFIELDS_4TUPLE:
218 		hash = faddr.s_addr ^ fport;
219 		break;
220 
221 	case IPI_HASHFIELDS_2TUPLE:
222 		hash = faddr.s_addr ^ laddr.s_addr;
223 		break;
224 
225 	default:
226 		hash = 0;
227 	}
228 	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
229 	    hash)]);
230 }
231 
232 struct inpcbgroup *
in_pcbgroup_byinpcb(struct inpcb * inp)233 in_pcbgroup_byinpcb(struct inpcb *inp)
234 {
235 
236 	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
237 	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
238 }
239 
240 static void
in_pcbwild_add(struct inpcb * inp)241 in_pcbwild_add(struct inpcb *inp)
242 {
243 	struct inpcbinfo *pcbinfo;
244 	struct inpcbhead *head;
245 	u_int pgn;
246 
247 	INP_WLOCK_ASSERT(inp);
248 	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
249 	    ("%s: is wild",__func__));
250 
251 	pcbinfo = inp->inp_pcbinfo;
252 	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
253 		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
254 	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
255 	    0, pcbinfo->ipi_wildmask)];
256 	LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
257 	inp->inp_flags2 |= INP_PCBGROUPWILD;
258 	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
259 		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
260 }
261 
262 static void
in_pcbwild_remove(struct inpcb * inp)263 in_pcbwild_remove(struct inpcb *inp)
264 {
265 	struct inpcbinfo *pcbinfo;
266 	u_int pgn;
267 
268 	INP_WLOCK_ASSERT(inp);
269 	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
270 	    ("%s: not wild", __func__));
271 
272 	pcbinfo = inp->inp_pcbinfo;
273 	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
274 		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
275 	LIST_REMOVE(inp, inp_pcbgroup_wild);
276 	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
277 		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
278 	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
279 }
280 
281 static __inline int
in_pcbwild_needed(struct inpcb * inp)282 in_pcbwild_needed(struct inpcb *inp)
283 {
284 
285 #ifdef INET6
286 	if (inp->inp_vflag & INP_IPV6)
287 		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
288 	else
289 #endif
290 		return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
291 }
292 
293 static void
in_pcbwild_update_internal(struct inpcb * inp)294 in_pcbwild_update_internal(struct inpcb *inp)
295 {
296 	int wildcard_needed;
297 
298 	wildcard_needed = in_pcbwild_needed(inp);
299 	if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
300 		in_pcbwild_add(inp);
301 	else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
302 		in_pcbwild_remove(inp);
303 }
304 
305 /*
306  * Update the pcbgroup of an inpcb, which might include removing an old
307  * pcbgroup reference and/or adding a new one.  Wildcard processing is not
308  * performed here, although ideally we'll never install a pcbgroup for a
309  * wildcard inpcb (asserted below).
310  */
311 static void
in_pcbgroup_update_internal(struct inpcbinfo * pcbinfo,struct inpcbgroup * newpcbgroup,struct inpcb * inp)312 in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
313     struct inpcbgroup *newpcbgroup, struct inpcb *inp)
314 {
315 	struct inpcbgroup *oldpcbgroup;
316 	struct inpcbhead *pcbhash;
317 	uint32_t hashkey_faddr;
318 
319 	INP_WLOCK_ASSERT(inp);
320 
321 	oldpcbgroup = inp->inp_pcbgroup;
322 	if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
323 		INP_GROUP_LOCK(oldpcbgroup);
324 		LIST_REMOVE(inp, inp_pcbgrouphash);
325 		inp->inp_pcbgroup = NULL;
326 		INP_GROUP_UNLOCK(oldpcbgroup);
327 	}
328 	if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
329 #ifdef INET6
330 		if (inp->inp_vflag & INP_IPV6)
331 			hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */
332 		else
333 #endif
334 			hashkey_faddr = inp->inp_faddr.s_addr;
335 		INP_GROUP_LOCK(newpcbgroup);
336 		pcbhash = &newpcbgroup->ipg_hashbase[
337 		    INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport,
338 		    newpcbgroup->ipg_hashmask)];
339 		LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
340 		inp->inp_pcbgroup = newpcbgroup;
341 		INP_GROUP_UNLOCK(newpcbgroup);
342 	}
343 
344 	KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
345 	    ("%s: pcbgroup and wildcard!", __func__));
346 }
347 
348 /*
349  * Two update paths: one in which the 4-tuple on an inpcb has been updated
350  * and therefore connection groups may need to change (or a wildcard entry
351  * may needed to be installed), and another in which the 4-tuple has been
352  * set as a result of a packet received, in which case we may be able to use
353  * the hash on the mbuf to avoid doing a software hash calculation for RSS.
354  *
355  * In each case: first, let the wildcard code have a go at placing it as a
356  * wildcard socket.  If it was a wildcard, or if the connection has been
357  * dropped, then no pcbgroup is required (so potentially clear it);
358  * otherwise, calculate and update the pcbgroup for the inpcb.
359  */
360 void
in_pcbgroup_update(struct inpcb * inp)361 in_pcbgroup_update(struct inpcb *inp)
362 {
363 	struct inpcbinfo *pcbinfo;
364 	struct inpcbgroup *newpcbgroup;
365 
366 	INP_WLOCK_ASSERT(inp);
367 
368 	pcbinfo = inp->inp_pcbinfo;
369 	if (!in_pcbgroup_enabled(pcbinfo))
370 		return;
371 
372 	in_pcbwild_update_internal(inp);
373 	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
374 	    !(inp->inp_flags & INP_DROPPED)) {
375 #ifdef INET6
376 		if (inp->inp_vflag & INP_IPV6)
377 			newpcbgroup = in6_pcbgroup_byinpcb(inp);
378 		else
379 #endif
380 			newpcbgroup = in_pcbgroup_byinpcb(inp);
381 	} else
382 		newpcbgroup = NULL;
383 	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
384 }
385 
386 void
in_pcbgroup_update_mbuf(struct inpcb * inp,struct mbuf * m)387 in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
388 {
389 	struct inpcbinfo *pcbinfo;
390 	struct inpcbgroup *newpcbgroup;
391 
392 	INP_WLOCK_ASSERT(inp);
393 
394 	pcbinfo = inp->inp_pcbinfo;
395 	if (!in_pcbgroup_enabled(pcbinfo))
396 		return;
397 
398 	/*
399 	 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
400 	 * it; presumably this function should never be called for anything
401 	 * other than non-wildcard socket?
402 	 */
403 	in_pcbwild_update_internal(inp);
404 	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
405 	    !(inp->inp_flags & INP_DROPPED)) {
406 		newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
407 #ifdef INET6
408 		if (inp->inp_vflag & INP_IPV6) {
409 			if (newpcbgroup == NULL)
410 				newpcbgroup = in6_pcbgroup_byinpcb(inp);
411 		} else {
412 #endif
413 			if (newpcbgroup == NULL)
414 				newpcbgroup = in_pcbgroup_byinpcb(inp);
415 #ifdef INET6
416 		}
417 #endif
418 	} else
419 		newpcbgroup = NULL;
420 	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
421 }
422 
423 /*
424  * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
425  */
426 void
in_pcbgroup_remove(struct inpcb * inp)427 in_pcbgroup_remove(struct inpcb *inp)
428 {
429 	struct inpcbgroup *pcbgroup;
430 
431 	INP_WLOCK_ASSERT(inp);
432 
433 	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
434 		return;
435 
436 	if (inp->inp_flags2 & INP_PCBGROUPWILD)
437 		in_pcbwild_remove(inp);
438 
439 	pcbgroup = inp->inp_pcbgroup;
440 	if (pcbgroup != NULL) {
441 		INP_GROUP_LOCK(pcbgroup);
442 		LIST_REMOVE(inp, inp_pcbgrouphash);
443 		inp->inp_pcbgroup = NULL;
444 		INP_GROUP_UNLOCK(pcbgroup);
445 	}
446 }
447 
448 /*
449  * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
450  * for a protocol.
451  */
452 int
in_pcbgroup_enabled(struct inpcbinfo * pcbinfo)453 in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
454 {
455 
456 	return (pcbinfo->ipi_npcbgroups > 0);
457 }
458