1 /*-
2  * Copyright (c) 2019 Mindaugas Rasiukevicius <rmind at noxt eu>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28  * NPF port map mechanism.
29  *
30  *        The port map is a bitmap used to track TCP/UDP ports used for
31  *        translation.  Port maps are per IP addresses, therefore multiple
32  *        NAT policies operating on the same IP address will share the
33  *        same port map.
34  */
35 
36 #ifdef _KERNEL
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: npf_portmap.c,v 1.7 2020/08/28 06:35:50 riastradh Exp $");
39 
40 #include <sys/param.h>
41 #include <sys/types.h>
42 
43 #include <sys/atomic.h>
44 #include <sys/bitops.h>
45 #include <sys/kmem.h>
46 #include <sys/mutex.h>
47 #include <sys/cprng.h>
48 #include <sys/thmap.h>
49 #endif
50 
51 #include "npf_impl.h"
52 
53 /*
54  * Port map uses two-level bitmaps with compression to efficiently
55  * represent the maximum of 65536 (2^16) values.
56  *
57  * Level 0: 64 chunks each representing 1048 bits in two modes:
58  *
59  *        a) If PORTMAP_L1_TAG, then up to 5 values are packed in the
60  *        64-bit integer using 12 bits for each value, starting from the
61  *        most significant bits.  The four 4 least significant bits are
62  *        unused or reserved for pointer tagging.
63  *
64  *        b) If there are more than 5 values, then PORTMAP_L1_TAG is set
65  *        and the value serves as a pointer to the second level bitmap.
66  *
67  * Level 1: 16 chunks each representing 64 bits in plain uint64_t.
68  */
69 
70 #define   PORTMAP_MAX_BITS    (65536U)
71 #define   PORTMAP_MASK                  (PORTMAP_MAX_BITS - 1)
72 
73 #define   PORTMAP_L0_SHIFT    (10) // or 11
74 #define   PORTMAP_L0_MASK               ((1U << PORTMAP_L0_SHIFT) - 1)
75 #define   PORTMAP_L0_WORDS    (PORTMAP_MAX_BITS >> PORTMAP_L0_SHIFT)
76 
77 #define   PORTMAP_L1_SHIFT    (6)
78 #define   PORTMAP_L1_MASK               ((1U << PORTMAP_L1_SHIFT) - 1)
79 #define   PORTMAP_L1_WORDS    \
80     ((PORTMAP_MAX_BITS / PORTMAP_L0_WORDS) >> PORTMAP_L1_SHIFT)
81 
82 #define   PORTMAP_L1_TAG                (UINT64_C(1)) // use level 1
83 #define   PORTMAP_L1_GET(p)   ((void *)((uintptr_t)(p) & ~(uintptr_t)3))
84 
85 CTASSERT(sizeof(uint64_t) >= sizeof(uintptr_t));
86 
87 typedef struct {
88           volatile uint64_t   bits1[PORTMAP_L1_WORDS];
89 } bitmap_l1_t;
90 
91 typedef struct bitmap {
92           npf_addr_t                    addr;
93           volatile uint64_t   bits0[PORTMAP_L0_WORDS];
94           LIST_ENTRY(bitmap)  entry;
95           unsigned            addr_len;
96 } bitmap_t;
97 
98 #define   NPF_PORTMAP_MINPORT 1024
99 #define   NPF_PORTMAP_MAXPORT 65535
100 
101 struct npf_portmap {
102           thmap_t   *                   addr_map;
103           LIST_HEAD(, bitmap) bitmap_list;
104           kmutex_t            list_lock;
105           int                           min_port;
106           int                           max_port;
107 };
108 
109 static kmutex_t                         portmap_lock;
110 
111 void
npf_portmap_sysinit(void)112 npf_portmap_sysinit(void)
113 {
114 
115           mutex_init(&portmap_lock, MUTEX_DEFAULT, IPL_SOFTNET);
116 }
117 
118 void
npf_portmap_sysfini(void)119 npf_portmap_sysfini(void)
120 {
121 
122           mutex_destroy(&portmap_lock);
123 }
124 
125 void
npf_portmap_init(npf_t * npf)126 npf_portmap_init(npf_t *npf)
127 {
128           npf_portmap_t *pm = npf_portmap_create(
129               NPF_PORTMAP_MINPORT, NPF_PORTMAP_MAXPORT);
130           npf_param_t param_map[] = {
131                     {
132                               "portmap.min_port",
133                               &pm->min_port,
134                               .default_val = NPF_PORTMAP_MINPORT,
135                               .min = 1024, .max = 65535
136                     },
137                     {
138                               "portmap.max_port",
139                               &pm->max_port,
140                               .default_val = 49151, // RFC 6335
141                               .min = 1024, .max = 65535
142                     }
143           };
144 
145           npf_param_register(npf, param_map, __arraycount(param_map));
146           npf->portmap = pm;
147 }
148 
149 void
npf_portmap_fini(npf_t * npf)150 npf_portmap_fini(npf_t *npf)
151 {
152 
153           npf_portmap_destroy(npf->portmap);
154           npf->portmap = NULL; // diagnostic
155 }
156 
157 npf_portmap_t *
npf_portmap_create(int min_port,int max_port)158 npf_portmap_create(int min_port, int max_port)
159 {
160           npf_portmap_t *pm;
161 
162           pm = kmem_zalloc(sizeof(npf_portmap_t), KM_SLEEP);
163           mutex_init(&pm->list_lock, MUTEX_DEFAULT, IPL_SOFTNET);
164           pm->addr_map = thmap_create(0, NULL, THMAP_NOCOPY);
165           pm->min_port = min_port;
166           pm->max_port = max_port;
167           return pm;
168 }
169 
170 void
npf_portmap_destroy(npf_portmap_t * pm)171 npf_portmap_destroy(npf_portmap_t *pm)
172 {
173           npf_portmap_flush(pm);
174           KASSERT(LIST_EMPTY(&pm->bitmap_list));
175 
176           thmap_destroy(pm->addr_map);
177           mutex_destroy(&pm->list_lock);
178           kmem_free(pm, sizeof(npf_portmap_t));
179 }
180 
181 /////////////////////////////////////////////////////////////////////////
182 
183 #if defined(_LP64)
184 #define   __npf_atomic_cas_64 atomic_cas_64
185 #else
186 static uint64_t
__npf_atomic_cas_64(volatile uint64_t * ptr,uint64_t old,uint64_t new)187 __npf_atomic_cas_64(volatile uint64_t *ptr, uint64_t old, uint64_t new)
188 {
189           uint64_t prev;
190 
191           mutex_enter(&portmap_lock);
192           prev = *ptr;
193           if (prev == old) {
194                     *ptr = new;
195           }
196           mutex_exit(&portmap_lock);
197 
198           return prev;
199 }
200 #endif
201 
202 /*
203  * bitmap_word_isset: test whether the bit value is in the packed array.
204  *
205  * => Return true if any value equals the bit number value.
206  *
207  * Packed array: 60 MSB bits, 5 values, 12 bits each.
208  *
209  * Reference: "Bit Twiddling Hacks" by S.E. Anderson, Stanford.
210  * Based on the hasvalue() and haszero() ideas.  Since values are
211  * represented by upper 60 bits, we shift right by 4.
212  */
213 static bool
bitmap_word_isset(uint64_t x,unsigned bit)214 bitmap_word_isset(uint64_t x, unsigned bit)
215 {
216           uint64_t m, r;
217 
218           bit++;
219           KASSERT((x & PORTMAP_L1_TAG) == 0);
220           KASSERT(bit <= (PORTMAP_L0_MASK + 1));
221 
222           m = (x >> 4) ^ (UINT64_C(0x1001001001001) * bit);
223           r = (m - UINT64_C(0x1001001001001)) & (~m & UINT64_C(0x800800800800800));
224           return r != 0;
225 }
226 
227 /*
228  * bitmap_word_cax: compare-and-xor on packed array elements.
229  */
230 static uint64_t
bitmap_word_cax(uint64_t x,int exp,int bit)231 bitmap_word_cax(uint64_t x, int exp, int bit)
232 {
233           unsigned e = exp + 1;
234 
235           /*
236            * We need to distinguish "no value" from zero.  Just add one,
237            * since we use 12 bits to represent 11 bit values.
238            */
239           bit++;
240           KASSERT((unsigned)bit <= (PORTMAP_L0_MASK + 1));
241           KASSERT((x & PORTMAP_L1_TAG) == 0);
242 
243           if (((x >> 52) & 0xfff) == e)
244                     return x ^ ((uint64_t)bit << 52);
245           if (((x >> 40) & 0xfff) == e)
246                     return x ^ ((uint64_t)bit << 40);
247           if (((x >> 28) & 0xfff) == e)
248                     return x ^ ((uint64_t)bit << 28);
249           if (((x >> 16) & 0xfff) == e)
250                     return x ^ ((uint64_t)bit << 16);
251           if (((x >>  4) & 0xfff) == e)
252                     return x ^ ((uint64_t)bit << 4);
253           return 0;
254 }
255 
256 static unsigned
bitmap_word_unpack(uint64_t x,unsigned bitvals[static5])257 bitmap_word_unpack(uint64_t x, unsigned bitvals[static 5])
258 {
259           unsigned n = 0;
260           uint64_t v;
261 
262           KASSERT((x & PORTMAP_L1_TAG) == 0);
263 
264           if ((v = ((x >> 52)) & 0xfff) != 0)
265                     bitvals[n++] = v - 1;
266           if ((v = ((x >> 40)) & 0xfff) != 0)
267                     bitvals[n++] = v - 1;
268           if ((v = ((x >> 28)) & 0xfff) != 0)
269                     bitvals[n++] = v - 1;
270           if ((v = ((x >> 16)) & 0xfff) != 0)
271                     bitvals[n++] = v - 1;
272           if ((v = ((x >>  4)) & 0xfff) != 0)
273                     bitvals[n++] = v - 1;
274           return n;
275 }
276 
277 #if 0
278 static bool
279 bitmap_isset(const bitmap_t *bm, unsigned bit)
280 {
281           unsigned i, chunk_bit;
282           uint64_t bval, b;
283           bitmap_l1_t *bm1;
284 
285           KASSERT(bit < PORTMAP_MAX_BITS);
286           i = bit >> PORTMAP_L0_SHIFT;
287           bval = atomic_load_relaxed(&bm->bits0[i]);
288 
289           /*
290            * Empty check.  Note: we can test the whole word against zero,
291            * since zero bit values in the packed array result in bits set.
292            */
293           if (bval == 0)
294                     return false;
295 
296           /* Level 0 check. */
297           chunk_bit = bit & PORTMAP_L0_MASK;
298           if ((bval & PORTMAP_L1_TAG) == 0)
299                     return bitmap_word_isset(bval, chunk_bit);
300 
301           /* Level 1 check. */
302           bm1 = PORTMAP_L1_GET(bval);
303           KASSERT(bm1 != NULL);
304           i = chunk_bit >> PORTMAP_L1_SHIFT;
305           b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK);
306           return (bm1->bits1[i] & b) != 0;
307 }
308 #endif
309 
310 static bool
bitmap_set(bitmap_t * bm,unsigned bit)311 bitmap_set(bitmap_t *bm, unsigned bit)
312 {
313           unsigned i, chunk_bit;
314           uint64_t bval, b, oval, nval;
315           bitmap_l1_t *bm1;
316 again:
317           KASSERT(bit < PORTMAP_MAX_BITS);
318           i = bit >> PORTMAP_L0_SHIFT;
319           chunk_bit = bit & PORTMAP_L0_MASK;
320           bval = bm->bits0[i];
321 
322           if ((bval & PORTMAP_L1_TAG) == 0) {
323                     unsigned n = 0, bitvals[5];
324                     uint64_t bm1p;
325 
326                     if (bitmap_word_isset(bval, chunk_bit)) {
327                               return false;
328                     }
329 
330                     /*
331                      * Look for a zero-slot and put a value there.
332                      */
333                     if ((nval = bitmap_word_cax(bval, -1, chunk_bit)) != 0) {
334                               KASSERT((nval & PORTMAP_L1_TAG) == 0);
335                               if (__npf_atomic_cas_64(&bm->bits0[i], bval, nval) != bval) {
336                                         goto again;
337                               }
338                               return true;
339                     }
340 
341                     /*
342                      * Full: allocate L1 block and copy over the current
343                      * values into the level.
344                      */
345                     bm1 = kmem_intr_zalloc(sizeof(bitmap_l1_t), KM_NOSLEEP);
346                     if (bm1 == NULL) {
347                               return false; // error
348                     }
349                     n = bitmap_word_unpack(bval, bitvals);
350                     while (n--) {
351                               const unsigned v = bitvals[n];
352                               const unsigned off = v >> PORTMAP_L1_SHIFT;
353 
354                               KASSERT(v <= PORTMAP_L0_MASK);
355                               KASSERT(off < (sizeof(uint64_t) * CHAR_BIT));
356                               bm1->bits1[off] |= UINT64_C(1) << (v & PORTMAP_L1_MASK);
357                     }
358 
359                     /*
360                      * Attempt to set the L1 structure.  Note: there is no
361                      * ABA problem since the we compare the actual values.
362                      * Note: CAS serves as a memory barrier.
363                      */
364                     bm1p = (uintptr_t)bm1;
365                     KASSERT((bm1p & PORTMAP_L1_TAG) == 0);
366                     bm1p |= PORTMAP_L1_TAG;
367                     if (__npf_atomic_cas_64(&bm->bits0[i], bval, bm1p) != bval) {
368                               kmem_intr_free(bm1, sizeof(bitmap_l1_t));
369                               goto again;
370                     }
371                     bval = bm1p;
372           }
373 
374           bm1 = PORTMAP_L1_GET(bval);
375           KASSERT(bm1 != NULL);
376           i = chunk_bit >> PORTMAP_L1_SHIFT;
377           b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK);
378 
379           oval = bm1->bits1[i];
380           if (oval & b) {
381                     return false;
382           }
383           nval = oval | b;
384           if (__npf_atomic_cas_64(&bm1->bits1[i], oval, nval) != oval) {
385                     goto again;
386           }
387           return true;
388 }
389 
390 static bool
bitmap_clr(bitmap_t * bm,unsigned bit)391 bitmap_clr(bitmap_t *bm, unsigned bit)
392 {
393           unsigned i, chunk_bit;
394           uint64_t bval, b, oval, nval;
395           bitmap_l1_t *bm1;
396 again:
397           KASSERT(bit < PORTMAP_MAX_BITS);
398           i = bit >> PORTMAP_L0_SHIFT;
399           chunk_bit = bit & PORTMAP_L0_MASK;
400           bval = bm->bits0[i];
401 
402           if ((bval & PORTMAP_L1_TAG) == 0) {
403                     if (!bitmap_word_isset(bval, chunk_bit)) {
404                               return false;
405                     }
406                     nval = bitmap_word_cax(bval, chunk_bit, chunk_bit);
407                     KASSERT((nval & PORTMAP_L1_TAG) == 0);
408                     if (__npf_atomic_cas_64(&bm->bits0[i], bval, nval) != bval) {
409                               goto again;
410                     }
411                     return true;
412           }
413 
414           bm1 = PORTMAP_L1_GET(bval);
415           KASSERT(bm1 != NULL);
416           i = chunk_bit >> PORTMAP_L1_SHIFT;
417           b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK);
418 
419           oval = bm1->bits1[i];
420           if ((oval & b) == 0) {
421                     return false;
422           }
423           nval = oval & ~b;
424           if (__npf_atomic_cas_64(&bm1->bits1[i], oval, nval) != oval) {
425                     goto again;
426           }
427           return true;
428 }
429 
430 /////////////////////////////////////////////////////////////////////////
431 
432 static bitmap_t *
npf_portmap_autoget(npf_portmap_t * pm,unsigned alen,const npf_addr_t * addr)433 npf_portmap_autoget(npf_portmap_t *pm, unsigned alen, const npf_addr_t *addr)
434 {
435           bitmap_t *bm;
436 
437           KASSERT(pm && pm->addr_map);
438           KASSERT(alen && alen <= sizeof(npf_addr_t));
439 
440           /* Lookup the port map for this address. */
441           bm = thmap_get(pm->addr_map, addr, alen);
442           if (bm == NULL) {
443                     void *ret;
444 
445                     /*
446                      * Allocate a new port map for this address and
447                      * attempt to insert it.
448                      */
449                     bm = kmem_intr_zalloc(sizeof(bitmap_t), KM_NOSLEEP);
450                     if (bm == NULL) {
451                               return NULL;
452                     }
453                     memcpy(&bm->addr, addr, alen);
454                     bm->addr_len = alen;
455 
456                     int s = splsoftnet();
457                     ret = thmap_put(pm->addr_map, &bm->addr, alen, bm);
458                     splx(s);
459 
460                     if (ret == bm) {
461                               /* Success: insert the bitmap into the list. */
462                               mutex_enter(&pm->list_lock);
463                               LIST_INSERT_HEAD(&pm->bitmap_list, bm, entry);
464                               mutex_exit(&pm->list_lock);
465                     } else {
466                               /* Race: use an existing bitmap. */
467                               kmem_free(bm, sizeof(bitmap_t));
468                               bm = ret;
469                     }
470           }
471           return bm;
472 }
473 
474 /*
475  * npf_portmap_flush: free all bitmaps and remove all addresses.
476  *
477  * => Concurrent calls to this routine are not allowed; therefore no
478  * need to acquire locks.
479  */
480 void
npf_portmap_flush(npf_portmap_t * pm)481 npf_portmap_flush(npf_portmap_t *pm)
482 {
483           bitmap_t *bm;
484 
485           while ((bm = LIST_FIRST(&pm->bitmap_list)) != NULL) {
486                     for (unsigned i = 0; i < PORTMAP_L0_WORDS; i++) {
487                               uintptr_t bm1 = bm->bits0[i];
488 
489                               if (bm1 & PORTMAP_L1_TAG) {
490                                         bitmap_l1_t *bm1p = PORTMAP_L1_GET(bm1);
491                                         kmem_intr_free(bm1p, sizeof(bitmap_l1_t));
492                               }
493                               bm->bits0[i] = UINT64_C(0);
494                     }
495                     LIST_REMOVE(bm, entry);
496                     thmap_del(pm->addr_map, &bm->addr, bm->addr_len);
497                     kmem_intr_free(bm, sizeof(bitmap_t));
498           }
499           /* Note: the caller ensures there are no active references. */
500           thmap_gc(pm->addr_map, thmap_stage_gc(pm->addr_map));
501 }
502 
503 /*
504  * npf_portmap_get: allocate and return a port from the given portmap.
505  *
506  * => Returns the port value in network byte-order.
507  * => Zero indicates a failure.
508  */
509 in_port_t
npf_portmap_get(npf_portmap_t * pm,int alen,const npf_addr_t * addr)510 npf_portmap_get(npf_portmap_t *pm, int alen, const npf_addr_t *addr)
511 {
512           const unsigned min_port = atomic_load_relaxed(&pm->min_port);
513           const unsigned max_port = atomic_load_relaxed(&pm->max_port);
514           const unsigned port_delta = max_port - min_port + 1;
515           unsigned bit, target;
516           bitmap_t *bm;
517 
518           /* Sanity check: the user might set incorrect parameters. */
519           if (__predict_false(min_port > max_port)) {
520                     return 0;
521           }
522 
523           bm = npf_portmap_autoget(pm, alen, addr);
524           if (__predict_false(bm == NULL)) {
525                     /* No memory. */
526                     return 0;
527           }
528 
529           /* Randomly select a port. */
530           target = min_port + (cprng_fast32() % port_delta);
531           bit = target;
532 next:
533           if (bitmap_set(bm, bit)) {
534                     /* Success. */
535                     return htons(bit);
536           }
537           bit = min_port + ((bit + 1) % port_delta);
538           if (target != bit) {
539                     /* Next.. */
540                     goto next;
541           }
542           /* No space. */
543           return 0;
544 }
545 
546 /*
547  * npf_portmap_take: allocate a specific port in the portmap.
548  */
549 bool
npf_portmap_take(npf_portmap_t * pm,int alen,const npf_addr_t * addr,in_port_t port)550 npf_portmap_take(npf_portmap_t *pm, int alen,
551     const npf_addr_t *addr, in_port_t port)
552 {
553           bitmap_t *bm = npf_portmap_autoget(pm, alen, addr);
554 
555           port = ntohs(port);
556           if (!bm || port < pm->min_port || port > pm->max_port) {
557                     /* Out of memory / invalid port. */
558                     return false;
559           }
560           return bitmap_set(bm, port);
561 }
562 
563 /*
564  * npf_portmap_put: release the port, making it available in the portmap.
565  *
566  * => The port value should be in network byte-order.
567  */
568 void
npf_portmap_put(npf_portmap_t * pm,int alen,const npf_addr_t * addr,in_port_t port)569 npf_portmap_put(npf_portmap_t *pm, int alen,
570     const npf_addr_t *addr, in_port_t port)
571 {
572           bitmap_t *bm;
573 
574           bm = npf_portmap_autoget(pm, alen, addr);
575           if (bm) {
576                     port = ntohs(port);
577                     bitmap_clr(bm, port);
578           }
579 }
580