1 /*        $NetBSD: tcp_vtw.h,v 1.11 2024/10/07 23:17:00 jakllsch Exp $          */
2 /*
3  * Copyright (c) 2011 The NetBSD Foundation, Inc.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to The NetBSD Foundation
7  * by Coyote Point Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Vestigial time-wait.
33  *
34  * This implementation uses cache-efficient techniques, which will
35  * appear somewhat peculiar.  The main philosophy is to optimise the
36  * amount of information available within a cache line.  Cache miss is
37  * expensive.  So we employ ad-hoc techniques to pull a series of
38  * linked-list follows into a cache line.  One cache line, multiple
39  * linked-list equivalents.
40  *
41  * One such ad-hoc technique is fat pointers.  Additional degrees of
42  * ad-hoqueness result from having to hand tune it for pointer size
43  * and for cache line size.
44  *
45  * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
46  * data structures into one cache line.  The additional 32 bits in the
47  * cache line are used for linking fat pointers, and for
48  * allocation/bookkeeping.
49  *
50  * The 15 32-bit tags encode the pointers to the linked list elements,
51  * and also encode the results of a search comparison.
52  *
53  * First, some more assumptions/restrictions.
54  *
55  * All the fat pointers are from a contiguous allocation arena.  Thus,
56  * we can refer to them by offset from a base, not as full pointers.
57  *
58  * All the linked list data elements are also from a contiguous
59  * allocation arena, again so that we can refer to them as offset from
60  * a base.
61  *
62  * In order to add a data element to a fat pointer, a key value is
63  * computed, based on unique data within the data element.  It is the
64  * linear searching of the linked lists of these elements based on
65  * these unique data that are being optimised here.
66  *
67  * Lets call the function that computes the key k(e), where e is the
68  * data element.  In this example, k(e) returns 32-bits.
69  *
70  * Consider a set E (say of order 15) of data elements.  Let K be
71  * the set of the k(e) for e in E.
72  *
73  * Let O be the set of the offsets from the base of the data elements in E.
74  *
75  * For each x in K, for each matching o in O, let t be x ^ o.  These
76  * are the tags. (More or less).
77  *
78  * In order to search all the data elements in E, we compute the
79  * search key, and one at a time, XOR the key into the tags.  If any
80  * result is a valid data element index, we have a possible match.  If
81  * not, there is no match.
82  *
83  * The no-match cases mean we do not have to de-reference the pointer
84  * to the data element in question.  We save cache miss penalty and
85  * cache load decreases.  Only in the case of a valid looking data
86  * element index, do we have to look closer.
87  *
88  * Thus, in the absence of false positives, 15 data elements can be
89  * searched with one cache line fill, as opposed to 15 cache line
90  * fills for the usual implementation.
91  *
92  * The vestigial time waits (vtw_t), the data elements in the above, are
93  * searched by faddr, fport, laddr, lport.  The key is a function of
94  * these values.
95  *
96  * We hash these keys into the traditional hash chains to reduce the
97  * search time, and use fat pointers to reduce the cache impacts of
98  * searching.
99  *
100  * The vtw_t are, per requirement, in a contiguous chunk.  Allocation
101  * is done with a clock hand, and all vtw_t within one allocation
102  * domain have the same lifetime, so they will always be sorted by
103  * age.
104  *
105  * A vtw_t will be allocated, timestamped, and have a fixed future
106  * expiration.  It will be added to a hash bucket implemented with fat
107  * pointers, which means that a cache line will be allocated in the
108  * hash bucket, placed at the head (more recent in time) and the vtw_t
109  * will be added to this.  As more entries are added, the fat pointer
110  * cache line will fill, requiring additional cache lines for fat
111  * pointers to be allocated. These will be added at the head, and the
112  * aged entries will hang down, tapeworm like.  As the vtw_t entries
113  * expire, the corresponding slot in the fat pointer will be
114  * reclaimed, and eventually the cache line will completely empty and
115  * be re-cycled, if not at the head of the chain.
116  *
117  * At times, a time-wait timer is restarted.  This corresponds to
118  * deleting the current entry and re-adding it.
119  *
120  * Most of the time, they are just placed here to die.
121  */
122 #ifndef _NETINET_TCP_VTW_H
123 #define _NETINET_TCP_VTW_H
124 
125 #include <sys/types.h>
126 #include <sys/socket.h>
127 #include <sys/sysctl.h>
128 #include <net/if.h>
129 #include <netinet/in.h>
130 #include <netinet/in_systm.h>
131 #include <netinet/ip.h>
132 #include <netinet/in_pcb.h>
133 #include <netinet/in_var.h>
134 #include <netinet/ip_var.h>
135 #include <netinet/in.h>
136 #include <netinet/tcp.h>
137 #include <netinet/tcp_timer.h>
138 #include <netinet/tcp_var.h>
139 #include <netinet6/in6.h>
140 #include <netinet/ip6.h>
141 #include <netinet6/ip6_var.h>
142 #include <netinet6/in6_pcb.h>
143 #include <netinet6/ip6_var.h>
144 #include <netinet6/in6_var.h>
145 #include <netinet/icmp6.h>
146 
147 #define   VTW_NCLASS          (1+3)               /* # different classes */
148 
149 /*
150  * fat pointers, MI.
151  */
152 struct fatp_mi;
153 
154 #if CACHE_LINE_SIZE >= 128
155 typedef uint64_t fatp_word_t;
156 #else
157 typedef uint32_t fatp_word_t;
158 #endif
159 
160 typedef struct fatp_mi        fatp_t;
161 
162 /* Supported cacheline sizes: 32 64 128 bytes.  See fatp_key(),
163  * fatp_slot_from_key(), fatp_xtra[].
164  */
165 #define   FATP_NTAGS          (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
166 #define   FATP_NXT_WIDTH      (sizeof(fatp_word_t) * NBBY - FATP_NTAGS)
167 
168 #define   FATP_MAX  (1 << (FATP_NXT_WIDTH < 31 ? FATP_NXT_WIDTH : 31))
169 
170 /* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
171  * 15 tags per cacheline.  At most 2^17 fat pointers per fatp_ctl_t.
172  * The comments on the fatp_mi members, below, correspond to the worked
173  * example.
174  */
175 struct fatp_mi {
176           fatp_word_t         inuse     : FATP_NTAGS;       /* (1+15)*4 == CL_SIZE */
177           fatp_word_t         nxt       : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
178           fatp_word_t         tag[FATP_NTAGS];    /* 15 tags per CL */
179 };
180 
181 static __inline int
fatp_ntags(void)182 fatp_ntags(void)
183 {
184           return FATP_NTAGS;
185 }
186 
187 static __inline int
fatp_full(fatp_t * fp)188 fatp_full(fatp_t *fp)
189 {
190           fatp_t full;
191 
192           full.inuse = (1U << FATP_NTAGS) - 1U;
193 
194           return (fp->inuse == full.inuse);
195 }
196 
197 struct vtw_common;
198 struct vtw_v4;
199 struct vtw_v6;
200 struct vtw_ctl;
201 
202 /*!\brief common to all vtw
203  */
204 typedef struct vtw_common {
205           struct timeval      expire;             /* date of birth+msl */
206           uint32_t  key;                /* hash key: full hash */
207           uint32_t  port_key; /* hash key: local port hash */
208           uint32_t  rcv_nxt;
209           uint32_t  rcv_wnd;
210           uint32_t  snd_nxt;
211           uint32_t  snd_scale : 8;      /* window scaling for send win */
212           uint32_t  msl_class : 2;      /* TCP MSL class {0,1,2,3} */
213           uint32_t  reuse_port          : 1;
214           uint32_t  reuse_addr          : 1;
215           uint32_t  v6only              : 1;
216           uint32_t  hashed              : 1;      /* reachable via FATP */
217           uint32_t  uid;
218 } vtw_t;
219 
220 /*!\brief vestigial timewait for IPv4
221  */
222 typedef struct vtw_v4 {
223           vtw_t               common;             /*  must be first */
224           uint16_t  lport;
225           uint16_t  fport;
226           uint32_t  laddr;
227           uint32_t  faddr;
228 } vtw_v4_t;
229 
230 /*!\brief vestigial timewait for IPv6
231  */
232 typedef struct vtw_v6 {
233           vtw_t               common;             /* must be first */
234           uint16_t  lport;
235           uint16_t  fport;
236           struct in6_addr     laddr;
237           struct in6_addr     faddr;
238 } vtw_v6_t;
239 
240 struct fatp_ctl;
241 typedef struct vtw_ctl                  vtw_ctl_t;
242 typedef struct fatp_ctl                 fatp_ctl_t;
243 
244 /*
245  * The vestigial time waits are kept in a contiguous chunk.
246  * Allocation and free pointers run as clock hands thru this array.
247  */
248 struct vtw_ctl {
249           fatp_ctl_t          *fat;               /* collection of fatp to use  */
250           vtw_ctl_t *ctl;               /* <! controller's controller */
251           union {
252                     vtw_t               *v;       /* common                     */
253                     struct vtw_v4       *v4;      /* IPv4 resources             */
254                     struct vtw_v6       *v6;      /* IPv6 resources             */
255           }                   base,               /* base of vtw_t array                  */
256                     /**/      lim,                /* extent of vtw_t array      */
257                     /**/      alloc,              /* allocation pointer                   */
258                     /**/      oldest;             /* ^ to oldest                          */
259           uint32_t  nfree;              /* # free                     */
260           uint32_t  nalloc;             /* # allocated                          */
261           uint32_t  idx_mask; /* mask capturing all index bits*/
262           uint32_t  is_v4     : 1;
263           uint32_t  is_v6     : 1;
264           uint32_t  idx_bits: 6;
265           uint32_t  clidx     : 3;      /* <! class index */
266 };
267 
268 /*!\brief Collections of fat pointers.
269  */
270 struct fatp_ctl {
271           vtw_ctl_t *vtw;               /* associated VTWs            */
272           fatp_t              *base;              /* base of fatp_t array                 */
273           fatp_t              *lim;               /* extent of fatp_t array     */
274           fatp_t              *free;              /* free list                            */
275           uint32_t  mask;               /* hash mask                            */
276           uint32_t  nfree;              /* # free                     */
277           uint32_t  nalloc;             /* # allocated                          */
278           fatp_t              **hash;             /* hash anchors                         */
279           fatp_t              **port;             /* port hash anchors                    */
280 };
281 
282 /*!\brief stats
283  */
284 struct vtw_stats {
285           uint64_t  ins;                /* <! inserts */
286           uint64_t  del;                /* <! deleted */
287           uint64_t  kill;               /* <! assassination */
288           uint64_t  look[2];  /* <! lookup: full hash, port hash */
289           uint64_t  hit[2];             /* <! lookups that hit */
290           uint64_t  miss[2];  /* <! lookups that miss */
291           uint64_t  probe[2]; /* <! hits+miss */
292           uint64_t  losing[2];          /* <! misses requiring dereference */
293           uint64_t  max_chain[2];       /* <! max fatp chain traversed */
294           uint64_t  max_probe[2];       /* <! max probes in any one chain */
295           uint64_t  max_loss[2];        /* <! max losing probes in any one
296                                                    * chain
297                                                    */
298 };
299 
300 typedef struct vtw_stats      vtw_stats_t;
301 
302 /*!\brief follow fatp next 'pointer'
303  */
304 static __inline fatp_t *
fatp_next(fatp_ctl_t * fat,fatp_t * fp)305 fatp_next(fatp_ctl_t *fat, fatp_t *fp)
306 {
307           return fp->nxt ? fat->base + fp->nxt-1 : 0;
308 }
309 
310 /*!\brief determine a collection-relative fat pointer index.
311  */
312 static __inline uint32_t
fatp_index(fatp_ctl_t * fat,fatp_t * fp)313 fatp_index(fatp_ctl_t *fat, fatp_t *fp)
314 {
315           return fp ? 1 + (fp - fat->base) : 0;
316 }
317 
318 
319 static __inline uint32_t
v4_tag(uint32_t faddr,uint32_t fport,uint32_t laddr,uint32_t lport)320 v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport)
321 {
322           return (ntohl(faddr)   + ntohs(fport)
323                     + ntohl(laddr) + ntohs(lport));
324 }
325 
326 static __inline uint32_t
v6_tag(const struct in6_addr * faddr,uint16_t fport,const struct in6_addr * laddr,uint16_t lport)327 v6_tag(const struct in6_addr *faddr, uint16_t fport,
328        const struct in6_addr *laddr, uint16_t lport)
329 {
330 #ifdef IN6_HASH
331           return IN6_HASH(faddr, fport, laddr, lport);
332 #else
333           return 0;
334 #endif
335 }
336 
337 static __inline uint32_t
v4_port_tag(uint16_t lport)338 v4_port_tag(uint16_t lport)
339 {
340           uint32_t tag = lport ^ (lport << 11);
341 
342           tag ^= tag << 3;
343           tag += tag >> 5;
344           tag ^= tag << 4;
345           tag += tag >> 17;
346           tag ^= tag << 25;
347           tag += tag >> 6;
348 
349           return tag;
350 }
351 
352 static __inline uint32_t
v6_port_tag(uint16_t lport)353 v6_port_tag(uint16_t lport)
354 {
355           return v4_port_tag(lport);
356 }
357 
358 struct tcpcb;
359 struct tcphdr;
360 
361 int  vtw_add(int, struct tcpcb *);
362 void vtw_del(vtw_ctl_t *, vtw_t *);
363 int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th,
364                       uint32_t faddr, uint16_t fport,
365                       uint32_t laddr, uint16_t lport);
366 struct ip6_hdr;
367 struct in6_addr;
368 
369 int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th,
370                       const struct in6_addr *faddr, uint16_t fport,
371                       const struct in6_addr *laddr, uint16_t lport);
372 
373 typedef struct vestigial_inpcb {
374           union {
375                     struct in_addr      v4;
376                     struct in6_addr     v6;
377           } faddr, laddr;
378           uint16_t            fport, lport;
379           uint32_t            valid               : 1;
380           uint32_t            v4                  : 1;
381           uint32_t            reuse_addr          : 1;
382           uint32_t            reuse_port          : 1;
383           uint32_t            v6only              : 1;
384           uint32_t            more_tbd  : 1;
385           uint32_t            uid;
386           uint32_t            rcv_nxt;
387           uint32_t            rcv_wnd;
388           uint32_t            snd_nxt;
389           struct vtw_common   *vtw;
390           struct vtw_ctl                *ctl;
391 } vestigial_inpcb_t;
392 
393 #ifdef _KERNEL
394 void vtw_restart(vestigial_inpcb_t*);
395 int vtw_earlyinit(void);
396 int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
397 #endif /* _KERNEL */
398 
399 #ifdef VTW_DEBUG
400 typedef struct sin_either {
401           uint8_t             sin_len;
402           uint8_t             sin_family;
403           uint16_t  sin_port;
404           union {
405                     struct in_addr      v4;
406                     struct in6_addr     v6;
407           }                   sin_addr;
408 } sin_either_t;
409 
410 int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int);
411 
412 typedef struct vtw_sysargs {
413           uint32_t  op;
414           sin_either_t        fa;
415           sin_either_t        la;
416 } vtw_sysargs_t;
417 
418 #endif /* VTW_DEBUG */
419 
420 #endif /* _NETINET_TCP_VTW_H */
421