xref: /dragonfly/sys/net/pf/pf_norm.c (revision b272101acc636ac635f83d03265ef6a44a3ba51a)
1 /*        $OpenBSD: pf_norm.c,v 1.113 2008/05/07 07:07:29 markus Exp $ */
2 
3 /*
4  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
5  *
6  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/mbuf.h>
36 #include <sys/filio.h>
37 #include <sys/fcntl.h>
38 #include <sys/malloc.h>
39 #include <sys/socket.h>
40 #include <sys/kernel.h>
41 #include <sys/time.h>
42 
43 #include <net/if.h>
44 #include <net/if_var.h>
45 #include <net/if_types.h>
46 #include <net/bpf.h>
47 #include <net/route.h>
48 #include <net/pf/if_pflog.h>
49 
50 #include <netinet/in.h>
51 #include <netinet/in_var.h>
52 #include <netinet/in_systm.h>
53 #include <netinet/ip.h>
54 #include <netinet/ip_var.h>
55 #include <netinet/tcp.h>
56 #include <netinet/tcp_seq.h>
57 #include <netinet/udp.h>
58 #include <netinet/ip_icmp.h>
59 
60 #ifdef INET6
61 #include <netinet/ip6.h>
62 #endif /* INET6 */
63 
64 #include <net/pf/pfvar.h>
65 
/*
 * Bits kept in pf_fragment.fr_flags.
 */
66 #define PFFRAG_SEENLAST       0x0001              /* Seen the last fragment for this packet */
67 #define PFFRAG_NOBUFFER       0x0002              /* Non-buffering fragment cache */
68 #define PFFRAG_DROP 0x0004              /* Drop all fragments */
/*
 * True when the entry does not carry PFFRAG_NOBUFFER.  Such entries keep
 * their fragments on fr_queue; entries with the flag set track ranges on
 * fr_cache instead (see pf_free_fragment()).
 */
69 #define BUFFER_FRAGMENTS(fr)  (!((fr)->fr_flags & PFFRAG_NOBUFFER))
70 
71 
/*
 * Queue heads for fragment entries.  These are pointers because
 * pf_normalize_init() allocates an array of ncpus heads for each; the code
 * in this file indexes them with the current CPU id.
 */
72 TAILQ_HEAD(pf_fragqueue, pf_fragment)   *pf_fragqueue;
73 TAILQ_HEAD(pf_cachequeue, pf_fragment)  *pf_cachequeue;
74 
/* Ordering function handed to the red-black tree macros below. */
75 static __inline int  pf_frag_compare(struct pf_fragment *,
76                                   struct pf_fragment *);
/*
 * Lookup trees, likewise allocated as ncpus-element arrays.  Both use the
 * same pf_frag_tree head type: pf_frag_tree holds entries for which
 * BUFFER_FRAGMENTS() is true, pf_cache_tree holds the others.
 */
77 RB_HEAD(pf_frag_tree, pf_fragment)      *pf_frag_tree,
78                                                   *pf_cache_tree;
79 RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
80 RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
81 
/*
 * Private prototypes: key construction and lookup, entry teardown, the two
 * fragment handlers (pf_reassemble for buffered entries, pf_fragcache for
 * non-buffering cache entries) and TCP option normalization.
 */
82 /* Private prototypes */
83 void                           pf_ip2key(struct pf_fragment *, struct ip *);
84 void                           pf_remove_fragment(struct pf_fragment *);
85 void                           pf_flush_fragments(void);
86 void                           pf_free_fragment(struct pf_fragment *);
87 struct pf_fragment  *pf_find_fragment(struct ip *, struct pf_frag_tree *);
88 struct mbuf                   *pf_reassemble(struct mbuf **, struct pf_fragment **,
89                                   struct pf_frent *, int);
90 struct mbuf                   *pf_fragcache(struct mbuf **, struct ip*,
91                                   struct pf_fragment **, int, int, int *);
92 int                            pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
93                                   struct tcphdr *, int, sa_family_t);
94 
/*
 * Debug output helper.  When pf_status.debug is at least PF_DEBUG_MISC it
 * prints the name of the calling function followed by the message.  The
 * argument is pasted directly after kprintf, so it must be a complete,
 * parenthesized argument list: DPFPRINTF(("value %d\n", v));
 */
95 #define   DPFPRINTF(x) do {                                 \
96           if (pf_status.debug >= PF_DEBUG_MISC) {           \
97                     kprintf("%s: ", __func__);              \
98                     kprintf x ;                                       \
99           }                                                           \
100 } while(0)
101 
/*
 * kmalloc types used by this file:
 *   M_PFFRAGPL   - struct pf_fragment for buffered reassembly entries
 *   M_PFCACHEPL  - struct pf_fragment for non-buffering cache entries
 *   M_PFFRENTPL  - struct pf_frent (one queued fragment)
 *   M_PFCENTPL   - struct pf_frcache (one cached byte range)
 *   M_PFSTATESCRUBPL - presumably per-state scrub data; not used in the
 *                  part of the file reviewed here, confirm at its use site
 */
102 static MALLOC_DEFINE(M_PFFRAGPL, "pffrag", "pf fragment pool list");
103 static MALLOC_DEFINE(M_PFCACHEPL, "pffrcache", "pf fragment cache pool list");
104 static MALLOC_DEFINE(M_PFFRENTPL, "pffrent", "pf frent pool list");
105 static MALLOC_DEFINE(M_PFCENTPL, "pffrcent", "pf fragment cent pool list");
106 static MALLOC_DEFINE(M_PFSTATESCRUBPL, "pfstatescrub", "pf state scrub pool list");
107 
108 /* Globals */
/* NOTE(review): the *_pl pointers are not referenced by the functions reviewed here. */
109 struct malloc_type   *pf_frent_pl, *pf_frag_pl, *pf_cache_pl, *pf_cent_pl;
110 struct malloc_type   *pf_state_scrub_pl;
/*
 * Running counts of pf_frent and pf_frcache entries; decremented wherever
 * this file frees one.  They are plain globals, not per-CPU values.
 */
111 int                            pf_nfrents, pf_ncache;
112 
113 void
pf_normalize_init(void)114 pf_normalize_init(void)
115 {
116           int n;
117 
118           /* XXX
119           pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
120           pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
121           pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
122           pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);
123           */
124 
125           /*
126            * pcpu queues and trees
127            */
128           pf_fragqueue = kmalloc(sizeof(*pf_fragqueue) * ncpus,
129                                         M_PF, M_WAITOK | M_ZERO);
130           pf_cachequeue = kmalloc(sizeof(*pf_cachequeue) * ncpus,
131                                         M_PF, M_WAITOK | M_ZERO);
132           pf_frag_tree = kmalloc(sizeof(*pf_frag_tree) * ncpus,
133                                         M_PF, M_WAITOK | M_ZERO);
134           pf_cache_tree = kmalloc(sizeof(*pf_cache_tree) * ncpus,
135                                         M_PF, M_WAITOK | M_ZERO);
136 
137           for (n = 0; n < ncpus; ++n) {
138                     TAILQ_INIT(&pf_fragqueue[n]);
139                     TAILQ_INIT(&pf_cachequeue[n]);
140                     RB_INIT(&pf_frag_tree[n]);
141                     RB_INIT(&pf_cache_tree[n]);
142           }
143 }
144 
145 void
pf_normalize_unload(void)146 pf_normalize_unload(void)
147 {
148           kfree(pf_fragqueue, M_PF);
149           kfree(pf_cachequeue, M_PF);
150           kfree(pf_frag_tree, M_PF);
151           kfree(pf_cache_tree, M_PF);
152 }
153 
154 static __inline int
pf_frag_compare(struct pf_fragment * a,struct pf_fragment * b)155 pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
156 {
157           int       diff;
158 
159           if ((diff = a->fr_id - b->fr_id))
160                     return (diff);
161           else if ((diff = a->fr_p - b->fr_p))
162                     return (diff);
163           else if (a->fr_src.s_addr < b->fr_src.s_addr)
164                     return (-1);
165           else if (a->fr_src.s_addr > b->fr_src.s_addr)
166                     return (1);
167           else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
168                     return (-1);
169           else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
170                     return (1);
171           return (0);
172 }
173 
174 void
pf_purge_expired_fragments(void)175 pf_purge_expired_fragments(void)
176 {
177           struct pf_fragment *frag;
178           u_int32_t expire;
179           int cpu = mycpu->gd_cpuid;
180 
181           expire = time_second - pf_default_rule.timeout[PFTM_FRAG];
182 
183           while ((frag = TAILQ_LAST(&pf_fragqueue[cpu], pf_fragqueue)) != NULL) {
184                     KASSERT((BUFFER_FRAGMENTS(frag)),
185                               ("BUFFER_FRAGMENTS(frag) == 0: %s", __func__));
186                     if (frag->fr_timeout > expire)
187                               break;
188 
189                     DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
190                     pf_free_fragment(frag);
191           }
192 
193           while ((frag = TAILQ_LAST(&pf_cachequeue[cpu], pf_cachequeue)) != NULL) {
194                     KASSERT((!BUFFER_FRAGMENTS(frag)),
195                               ("BUFFER_FRAGMENTS(frag) != 0: %s", __func__));
196                     if (frag->fr_timeout > expire)
197                               break;
198 
199                     DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
200                     pf_free_fragment(frag);
201                     KASSERT((TAILQ_EMPTY(&pf_cachequeue[cpu]) ||
202                         TAILQ_LAST(&pf_cachequeue[cpu], pf_cachequeue) != frag),
203                         ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s",
204                         __func__));
205           }
206 }
207 
208 /*
209  * Try to flush old fragments to make space for new ones
210  */
211 
212 void
pf_flush_fragments(void)213 pf_flush_fragments(void)
214 {
215           struct pf_fragment *frag;
216           int goal;
217           int cpu = mycpu->gd_cpuid;
218 
219           goal = pf_nfrents * 9 / 10;
220           DPFPRINTF(("trying to free > %d frents\n",
221               pf_nfrents - goal));
222           while (goal < pf_nfrents) {
223                     frag = TAILQ_LAST(&pf_fragqueue[cpu], pf_fragqueue);
224                     if (frag == NULL)
225                               break;
226                     pf_free_fragment(frag);
227           }
228 
229 
230           goal = pf_ncache * 9 / 10;
231           DPFPRINTF(("trying to free > %d cache entries\n",
232               pf_ncache - goal));
233           while (goal < pf_ncache) {
234                     frag = TAILQ_LAST(&pf_cachequeue[cpu], pf_cachequeue);
235                     if (frag == NULL)
236                               break;
237                     pf_free_fragment(frag);
238           }
239 }
240 
241 /* Frees the fragments and all associated entries */
242 
243 void
pf_free_fragment(struct pf_fragment * frag)244 pf_free_fragment(struct pf_fragment *frag)
245 {
246           struct pf_frent               *frent;
247           struct pf_frcache   *frcache;
248 
249           /* Free all fragments */
250           if (BUFFER_FRAGMENTS(frag)) {
251                     for (frent = LIST_FIRST(&frag->fr_queue); frent;
252                         frent = LIST_FIRST(&frag->fr_queue)) {
253                               LIST_REMOVE(frent, fr_next);
254 
255                               m_freem(frent->fr_m);
256                               kfree(frent, M_PFFRENTPL);
257                               pf_nfrents--;
258                     }
259           } else {
260                     for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
261                         frcache = LIST_FIRST(&frag->fr_cache)) {
262                               LIST_REMOVE(frcache, fr_next);
263 
264                               KASSERT((LIST_EMPTY(&frag->fr_cache) ||
265                                   LIST_FIRST(&frag->fr_cache)->fr_off >
266                                   frcache->fr_end),
267                                   ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >"
268                              " frcache->fr_end): %s", __func__));
269 
270                               kfree(frcache, M_PFCENTPL);
271                               pf_ncache--;
272                     }
273           }
274 
275           pf_remove_fragment(frag);
276 }
277 
278 void
pf_ip2key(struct pf_fragment * key,struct ip * ip)279 pf_ip2key(struct pf_fragment *key, struct ip *ip)
280 {
281           key->fr_p = ip->ip_p;
282           key->fr_id = ip->ip_id;
283           key->fr_src.s_addr = ip->ip_src.s_addr;
284           key->fr_dst.s_addr = ip->ip_dst.s_addr;
285 }
286 
287 struct pf_fragment *
pf_find_fragment(struct ip * ip,struct pf_frag_tree * tree)288 pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
289 {
290           struct pf_fragment   key;
291           struct pf_fragment  *frag;
292           int cpu = mycpu->gd_cpuid;
293 
294           pf_ip2key(&key, ip);
295 
296           frag = RB_FIND(pf_frag_tree, tree, &key);
297           if (frag != NULL) {
298                     /* XXX Are we sure we want to update the timeout? */
299                     frag->fr_timeout = time_second;
300                     if (BUFFER_FRAGMENTS(frag)) {
301                               TAILQ_REMOVE(&pf_fragqueue[cpu], frag, frag_next);
302                               TAILQ_INSERT_HEAD(&pf_fragqueue[cpu], frag, frag_next);
303                     } else {
304                               TAILQ_REMOVE(&pf_cachequeue[cpu], frag, frag_next);
305                               TAILQ_INSERT_HEAD(&pf_cachequeue[cpu], frag, frag_next);
306                     }
307           }
308 
309           return (frag);
310 }
311 
312 /* Removes a fragment from the fragment queue and frees the fragment */
313 
314 void
pf_remove_fragment(struct pf_fragment * frag)315 pf_remove_fragment(struct pf_fragment *frag)
316 {
317           int cpu = mycpu->gd_cpuid;
318 
319           if (BUFFER_FRAGMENTS(frag)) {
320                     RB_REMOVE(pf_frag_tree, &pf_frag_tree[cpu], frag);
321                     TAILQ_REMOVE(&pf_fragqueue[cpu], frag, frag_next);
322                     kfree(frag, M_PFFRAGPL);
323           } else {
324                     RB_REMOVE(pf_frag_tree, &pf_cache_tree[cpu], frag);
325                     TAILQ_REMOVE(&pf_cachequeue[cpu], frag, frag_next);
326                     kfree(frag, M_PFCACHEPL);
327           }
328 }
329 
/*
 * Payload offset, in bytes, of a queued fragment (struct pf_frent): the
 * ip_off field of its IP header in host order, masked with IP_OFFMASK and
 * multiplied by 8.
 */
330 #define FR_IP_OFF(fr)         ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
331 
/*
 * Add one fragment to a buffered reassembly entry and return the complete
 * packet once all of its data has arrived.
 *
 *   m0    - the fragment's mbuf.  Its IP header is stripped from the mbuf
 *           data window on entry; frent->fr_ip is still used to read and
 *           rewrite header fields afterwards.
 *           NOTE(review): presumably frent->fr_m is this same mbuf; confirm
 *           against the caller.
 *   frag  - in/out.  The existing entry for this packet, or a pointer to
 *           NULL to have one created and linked on the current CPU's tree
 *           and queue.  Reset to NULL when the entry is released here
 *           (successful reassembly or oversize discard).
 *   frent - queue record for this fragment.  It is linked onto the entry,
 *           or freed on the drop path.
 *   mff   - non-zero when more fragments follow; zero marks the last one.
 *
 * Returns the reassembled mbuf chain with its IP header restored, or NULL
 * when the fragment was queued, dropped, or the whole entry was discarded.
 */
332 struct mbuf *
pf_reassemble(struct mbuf ** m0,struct pf_fragment ** frag,struct pf_frent * frent,int mff)333 pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
334     struct pf_frent *frent, int mff)
335 {
336           struct mbuf         *m = *m0, *m2;
337           struct pf_frent     *frea, *next;
338           struct pf_frent     *frep = NULL;
339           struct ip *ip = frent->fr_ip;
340           int                 hlen = ip->ip_hl << 2;
          /* off/ip_len/max: payload start, payload length and payload end, in bytes */
341           u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
342           u_int16_t ip_len = ntohs(ip->ip_len) - hlen;
343           u_int16_t max = ip_len + off;
344           int                 cpu = mycpu->gd_cpuid;
345 
346           KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)),
347               ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __func__));
348 
349           /* Strip off ip header */
350           m->m_data += hlen;
351           m->m_len -= hlen;
352 
353           /* Create a new reassembly queue for this packet */
354           if (*frag == NULL) {
355                     *frag = kmalloc(sizeof(struct pf_fragment),
356                                         M_PFFRAGPL, M_NOWAIT);
357                     if (*frag == NULL) {
                              /* Out of memory: discard old entries and retry once */
358                               pf_flush_fragments();
359                               *frag = kmalloc(sizeof(struct pf_fragment),
360                                                   M_PFFRAGPL, M_NOWAIT);
361                               if (*frag == NULL)
362                                         goto drop_fragment;
363                     }
364 
365                     (*frag)->fr_flags = 0;
366                     (*frag)->fr_max = 0;
367                     (*frag)->fr_src = frent->fr_ip->ip_src;
368                     (*frag)->fr_dst = frent->fr_ip->ip_dst;
369                     (*frag)->fr_p = frent->fr_ip->ip_p;
370                     (*frag)->fr_id = frent->fr_ip->ip_id;
371                     (*frag)->fr_timeout = time_second;
372                     LIST_INIT(&(*frag)->fr_queue);
373 
374                     RB_INSERT(pf_frag_tree, &pf_frag_tree[cpu], *frag);
375                     TAILQ_INSERT_HEAD(&pf_fragqueue[cpu], *frag, frag_next);
376 
377                     /* We do not have a previous fragment */
378                     frep = NULL;
379                     goto insert;
380           }
381 
382           /*
383            * Find a fragment after the current one:
384            *  - off contains the real shifted offset.
385            */
          /* After the loop: frep = last entry at or before off, frea = first entry past off */
386           LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
387                     if (FR_IP_OFF(frea) > off)
388                               break;
389                     frep = frea;
390           }
391 
392           KASSERT((frep != NULL || frea != NULL),
393               ("!(frep != NULL || frea != NULL): %s", __func__));
394 
395           /*
396            * Merge with previous fragment by cutting the start of
397            * the current packet.
398            */
399           if (frep != NULL &&
400               (FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
401                frep->fr_ip->ip_hl * 4) > off)
402           {
403                     u_int16_t precut;
404 
                    /* precut = bytes of this fragment already covered by frep */
405                     precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
406                                frep->fr_ip->ip_hl * 4 - off;
                    /* The previous fragment covers this one entirely: nothing new */
407                     if (precut >= ip_len)
408                               goto drop_fragment;
409                     m_adj(frent->fr_m, precut);
410                     DPFPRINTF(("overlap -%d\n", precut));
411                     /* Enforce 8 byte boundaries */
412                     ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
413                     off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
414                     ip_len -= precut;
415                     ip->ip_len = htons(ip_len + hlen);
416           }
417 
418           /*
419            * Cut or delete overlapping later fragments.
420            */
421           for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
422               frea = next)
423           {
424                     u_int16_t aftercut;
425 
                    /* aftercut = bytes at the start of frea covered by this fragment */
426                     aftercut = ip_len + off - FR_IP_OFF(frea);
427                     DPFPRINTF(("adjust overlap %d\n", aftercut));
428                     if (aftercut < (ntohs(frea->fr_ip->ip_len) -
429                                         frea->fr_ip->ip_hl * 4))
430                     {
                              /* Partial overlap: trim the front of frea and stop */
431                               frea->fr_ip->ip_len =
432                                   htons(ntohs(frea->fr_ip->ip_len) - aftercut +
433                                           frea->fr_ip->ip_hl * 4);
434                               frea->fr_ip->ip_off =
435                                   htons(ntohs(frea->fr_ip->ip_off) + (aftercut >> 3));
436                               m_adj(frea->fr_m, aftercut);
437                               break;
438                     }
439 
440                     /* This fragment is completely overlapped, lose it */
441                     next = LIST_NEXT(frea, fr_next);
442                     m_freem(frea->fr_m);
443                     LIST_REMOVE(frea, fr_next);
444                     kfree(frea, M_PFFRENTPL);
445                     pf_nfrents--;
446           }
447 
448  insert:
449           /* Update maximum data size */
450           if ((*frag)->fr_max < max)
451                     (*frag)->fr_max = max;
452           /* This is the last segment */
453           if (!mff)
454                     (*frag)->fr_flags |= PFFRAG_SEENLAST;
455 
          /* Keep the queue sorted by offset: link after frep, or at the head */
456           if (frep == NULL)
457                     LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
458           else
459                     LIST_INSERT_AFTER(frep, frent, fr_next);
460 
461           /* Check if we are completely reassembled */
462           if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
463                     return (NULL);
464 
465           /* Check if we have all the data */
          /*
           * off is reused as a running total of payload bytes from the head
           * of the queue.  A next entry that does not start exactly at that
           * total, before fr_max is reached, means a hole remains.
           */
466           off = 0;
467           for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
468                     next = LIST_NEXT(frep, fr_next);
469 
470                     off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
471                     if (off < (*frag)->fr_max &&
472                         (next == NULL || FR_IP_OFF(next) != off))
473                     {
474                               DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
475                                   off, next == NULL ? -1 : FR_IP_OFF(next),
476                                   (*frag)->fr_max));
477                               return (NULL);
478                     }
479           }
480           DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
481           if (off < (*frag)->fr_max)
482                     return (NULL);
483 
484           /* We have all the data */
485           frent = LIST_FIRST(&(*frag)->fr_queue);
486           KASSERT((frent != NULL), ("frent == NULL: %s", __func__));
          /* First fragment's header plus the total payload must not exceed IP_MAXPACKET */
487           if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
488                     DPFPRINTF(("drop: too big: %d\n", off));
489                     pf_free_fragment(*frag);
490                     *frag = NULL;
491                     return (NULL);
492           }
493           next = LIST_NEXT(frent, fr_next);
494 
495           /* Magic from ip_input */
          /*
           * The first fragment's mbuf becomes the head of the result.  Its
           * own chain is detached and re-appended with m_cat(), then every
           * later fragment's mbuf is appended in queue order.  Each frent
           * record is freed as its mbuf is taken over.
           */
496           ip = frent->fr_ip;
497           m = frent->fr_m;
498           m2 = m->m_next;
499           m->m_next = NULL;
500           m_cat(m, m2);
501           kfree(frent, M_PFFRENTPL);
502           pf_nfrents--;
503           for (frent = next; frent != NULL; frent = next) {
504                     next = LIST_NEXT(frent, fr_next);
505 
506                     m2 = frent->fr_m;
507                     kfree(frent, M_PFFRENTPL);
508                     pf_nfrents--;
509                     m_cat(m, m2);
510           }
511 
          /* Restore the addresses recorded when the entry was created */
512           ip->ip_src = (*frag)->fr_src;
513           ip->ip_dst = (*frag)->fr_dst;
514 
515           /* Remove from fragment queue */
516           pf_remove_fragment(*frag);
517           *frag = NULL;
518 
          /*
           * Rewrite the header for the whole packet: total length, and an
           * ip_off that keeps only the IP_DF bit.  Then move the mbuf data
           * window back so it covers the IP header again.
           */
519           hlen = ip->ip_hl << 2;
520           ip->ip_len = htons(off + hlen);
521           ip->ip_off &= htons(IP_DF);
522           m->m_len += hlen;
523           m->m_data -= hlen;
524 
525           /* some debugging cruft by sklower, below, will go away soon */
526           /* XXX this should be done elsewhere */
          /* Recompute the packet-header length by summing the chain */
527           if (m->m_flags & M_PKTHDR) {
528                     int plen = 0;
529                     for (m2 = m; m2; m2 = m2->m_next)
530                               plen += m2->m_len;
531                     m->m_pkthdr.len = plen;
532           }
533 
534 #if 0
535           kprintf("reassembly complete: len=%u\n", ntohs(ip->ip_len));
536           kprintf("ip_src=%08x dst=%08x tos=%u p=%u off=%u len=%u\n",
537                     ip->ip_src.s_addr, ip->ip_dst.s_addr, ip->ip_tos, ip->ip_p,
538                     ntohs(ip->ip_off), ntohs(ip->ip_len));
539 #endif
540 
541           DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
542           return (m);
543 
544  drop_fragment:
545           /* Oops - fail safe - drop packet */
          /*
           * Frees the record and the mbuf taken from *m0; *m0 itself is not
           * cleared here.
           * NOTE(review): the decrement assumes the caller already counted
           * this frent in pf_nfrents; confirm at the allocation site.
           */
546           kfree(frent, M_PFFRENTPL);
547           pf_nfrents--;
548           m_freem(m);
549           return (NULL);
550 }
551 
552 struct mbuf *
pf_fragcache(struct mbuf ** m0,struct ip * h,struct pf_fragment ** frag,int mff,int drop,int * nomem)553 pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
554     int drop, int *nomem)
555 {
556           struct mbuf         *m = *m0;
557           struct pf_frcache *frp, *fra, *cur = NULL;
558           int                 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
559           u_int16_t off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
560           u_int16_t max = ip_len + off;
561           int                 hosed = 0;
562           int                 cpu = mycpu->gd_cpuid;
563 
564           KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
565               ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __func__));
566 
567           /* Create a new range queue for this packet */
568           if (*frag == NULL) {
569                     *frag = kmalloc(sizeof(struct pf_fragment), M_PFCACHEPL, M_NOWAIT);
570                     if (*frag == NULL) {
571                               pf_flush_fragments();
572                               *frag = kmalloc(sizeof(struct pf_fragment), M_PFCACHEPL, M_NOWAIT);
573                               if (*frag == NULL)
574                                         goto no_mem;
575                     }
576 
577                     /* Get an entry for the queue */
578                     cur = kmalloc(sizeof(struct pf_frcache), M_PFCENTPL, M_NOWAIT);
579                     if (cur == NULL) {
580                               kfree(*frag, M_PFCACHEPL);
581                               *frag = NULL;
582                               goto no_mem;
583                     }
584                     pf_ncache++;
585 
586                     (*frag)->fr_flags = PFFRAG_NOBUFFER;
587                     (*frag)->fr_max = 0;
588                     (*frag)->fr_src = h->ip_src;
589                     (*frag)->fr_dst = h->ip_dst;
590                     (*frag)->fr_p = h->ip_p;
591                     (*frag)->fr_id = h->ip_id;
592                     (*frag)->fr_timeout = time_second;
593 
594                     cur->fr_off = off;
595                     cur->fr_end = max;
596                     LIST_INIT(&(*frag)->fr_cache);
597                     LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);
598 
599                     RB_INSERT(pf_frag_tree, &pf_cache_tree[cpu], *frag);
600                     TAILQ_INSERT_HEAD(&pf_cachequeue[cpu], *frag, frag_next);
601 
602                     DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
603 
604                     goto pass;
605           }
606 
607           /*
608            * Find a fragment after the current one:
609            *  - off contains the real shifted offset.
610            */
611           frp = NULL;
612           LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
613                     if (fra->fr_off > off)
614                               break;
615                     frp = fra;
616           }
617 
618           KASSERT((frp != NULL || fra != NULL),
619               ("!(frp != NULL || fra != NULL): %s", __func__));
620 
621           if (frp != NULL) {
622                     int       precut;
623 
624                     precut = frp->fr_end - off;
625                     if (precut >= ip_len) {
626                               /* Fragment is entirely a duplicate */
627                               DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
628                                   h->ip_id, frp->fr_off, frp->fr_end, off, max));
629                               goto drop_fragment;
630                     }
631                     if (precut == 0) {
632                               /* They are adjacent.  Fixup cache entry */
633                               DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
634                                   h->ip_id, frp->fr_off, frp->fr_end, off, max));
635                               frp->fr_end = max;
636                     } else if (precut > 0) {
637                               /* The first part of this payload overlaps with a
638                                * fragment that has already been passed.
639                                * Need to trim off the first part of the payload.
640                                * But to do so easily, we need to create another
641                                * mbuf to throw the original header into.
642                                */
643 
644                               DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
645                                   h->ip_id, precut, frp->fr_off, frp->fr_end, off,
646                                   max));
647 
648                               off += precut;
649                               max -= precut;
650                               /* Update the previous frag to encompass this one */
651                               frp->fr_end = max;
652 
653                               if (!drop) {
654                                         /* XXX Optimization opportunity
655                                          * This is a very heavy way to trim the payload.
656                                          * we could do it much faster by diddling mbuf
657                                          * internals but that would be even less legible
658                                          * than this mbuf magic.  For my next trick,
659                                          * I'll pull a rabbit out of my laptop.
660                                          */
661                                         *m0 = m_dup(m, M_NOWAIT);
662                                         /* From KAME Project : We have missed this! */
663                                         m_adj(*m0, (h->ip_hl << 2) -
664                                             (*m0)->m_pkthdr.len);
665                                         if (*m0 == NULL)
666                                                   goto no_mem;
667                                         KASSERT(((*m0)->m_next == NULL),
668                                             ("(*m0)->m_next != NULL: %s",
669                                             __func__));
670                                         m_adj(m, precut + (h->ip_hl << 2));
671                                         m_cat(*m0, m);
672                                         m = *m0;
673                                         if (m->m_flags & M_PKTHDR) {
674                                                   int plen = 0;
675                                                   struct mbuf *t;
676                                                   for (t = m; t; t = t->m_next)
677                                                             plen += t->m_len;
678                                                   m->m_pkthdr.len = plen;
679                                         }
680 
681 
682                                         h = mtod(m, struct ip *);
683 
684                                         KASSERT(((int)m->m_len ==
685                                             ntohs(h->ip_len) - precut),
686                                             ("m->m_len != h->ip_len - precut: %s",
687                                             __func__));
688                                         h->ip_off = htons(ntohs(h->ip_off) +
689                                                               (precut >> 3));
690                                         h->ip_len = htons(ntohs(h->ip_len) - precut);
691                               } else {
692                                         hosed++;
693                               }
694                     } else {
695                               /* There is a gap between fragments */
696 
697                               DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
698                                   h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
699                                   max));
700 
701                               cur = kmalloc(sizeof(struct pf_frcache), M_PFCENTPL, M_NOWAIT);
702                               if (cur == NULL)
703                                         goto no_mem;
704                               pf_ncache++;
705 
706                               cur->fr_off = off;
707                               cur->fr_end = max;
708                               LIST_INSERT_AFTER(frp, cur, fr_next);
709                     }
710           }
711 
712           if (fra != NULL) {
713                     int       aftercut;
714                     int       merge = 0;
715 
716                     aftercut = max - fra->fr_off;
717                     if (aftercut == 0) {
718                               /* Adjacent fragments */
719                               DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
720                                   h->ip_id, off, max, fra->fr_off, fra->fr_end));
721                               fra->fr_off = off;
722                               merge = 1;
723                     } else if (aftercut > 0) {
724                               /* Need to chop off the tail of this fragment */
725                               DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
726                                   h->ip_id, aftercut, off, max, fra->fr_off,
727                                   fra->fr_end));
728                               fra->fr_off = off;
729                               max -= aftercut;
730 
731                               merge = 1;
732 
733                               if (!drop) {
734                                         m_adj(m, -aftercut);
735                                         if (m->m_flags & M_PKTHDR) {
736                                                   int plen = 0;
737                                                   struct mbuf *t;
738                                                   for (t = m; t; t = t->m_next)
739                                                             plen += t->m_len;
740                                                   m->m_pkthdr.len = plen;
741                                         }
742                                         h = mtod(m, struct ip *);
743                                         KASSERT(((int)m->m_len ==
744                                                    ntohs(h->ip_len) - aftercut),
745                                             ("m->m_len != h->ip_len - aftercut: %s",
746                                             __func__));
747                                         h->ip_len = htons(ntohs(h->ip_len) - aftercut);
748                               } else {
749                                         hosed++;
750                               }
751                     } else if (frp == NULL) {
752                               /* There is a gap between fragments */
753                               DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
754                                   h->ip_id, -aftercut, off, max, fra->fr_off,
755                                   fra->fr_end));
756 
757                               cur = kmalloc(sizeof(struct pf_frcache), M_PFCENTPL, M_NOWAIT);
758                               if (cur == NULL)
759                                         goto no_mem;
760                               pf_ncache++;
761 
762                               cur->fr_off = off;
763                               cur->fr_end = max;
764                               LIST_INSERT_BEFORE(fra, cur, fr_next);
765                     }
766 
767 
768                     /* Need to glue together two separate fragment descriptors */
769                     if (merge) {
770                               if (cur && fra->fr_off <= cur->fr_end) {
771                                         /* Need to merge in a previous 'cur' */
772                                         DPFPRINTF(("fragcache[%d]: adjacent(merge "
773                                             "%d-%d) %d-%d (%d-%d)\n",
774                                             h->ip_id, cur->fr_off, cur->fr_end, off,
775                                             max, fra->fr_off, fra->fr_end));
776                                         fra->fr_off = cur->fr_off;
777                                         LIST_REMOVE(cur, fr_next);
778                                         kfree(cur, M_PFCENTPL);
779                                         pf_ncache--;
780                                         cur = NULL;
781 
782                               } else if (frp && fra->fr_off <= frp->fr_end) {
783                                         /* Need to merge in a modified 'frp' */
784                                         KASSERT((cur == NULL), ("cur != NULL: %s",
785                                             __func__));
786                                         DPFPRINTF(("fragcache[%d]: adjacent(merge "
787                                             "%d-%d) %d-%d (%d-%d)\n",
788                                             h->ip_id, frp->fr_off, frp->fr_end, off,
789                                             max, fra->fr_off, fra->fr_end));
790                                         fra->fr_off = frp->fr_off;
791                                         LIST_REMOVE(frp, fr_next);
792                                         kfree(frp, M_PFCENTPL);
793                                         pf_ncache--;
794                                         frp = NULL;
795 
796                               }
797                     }
798           }
799 
800           if (hosed) {
801                     /*
802                      * We must keep tracking the overall fragment even when
803                      * we're going to drop it anyway so that we know when to
804                      * free the overall descriptor.  Thus we drop the frag late.
805                      */
806                     goto drop_fragment;
807           }
808 
809 
810  pass:
811           /* Update maximum data size */
812           if ((*frag)->fr_max < max)
813                     (*frag)->fr_max = max;
814 
815           /* This is the last segment */
816           if (!mff)
817                     (*frag)->fr_flags |= PFFRAG_SEENLAST;
818 
819           /* Check if we are completely reassembled */
820           if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
821               LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
822               LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
823                     /* Remove from fragment queue */
824                     DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
825                         (*frag)->fr_max));
826                     pf_free_fragment(*frag);
827                     *frag = NULL;
828           }
829 
830           return (m);
831 
832  no_mem:
833           *nomem = 1;
834 
835           /* Still need to pay attention to !IP_MF */
836           if (!mff && *frag != NULL)
837                     (*frag)->fr_flags |= PFFRAG_SEENLAST;
838 
839           m_freem(m);
840           return (NULL);
841 
842  drop_fragment:
843 
844           /* Still need to pay attention to !IP_MF */
845           if (!mff && *frag != NULL)
846                     (*frag)->fr_flags |= PFFRAG_SEENLAST;
847 
848           if (drop) {
849                     /* This fragment has been deemed bad.  Don't reass */
850                     if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
851                               DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
852                                   h->ip_id));
853                     (*frag)->fr_flags |= PFFRAG_DROP;
854           }
855 
856           m_freem(m);
857           return (NULL);
858 }
859 
860 int
pf_normalize_ip(struct mbuf ** m0,int dir,struct pfi_kif * kif,u_short * reason,struct pf_pdesc * pd)861 pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
862     struct pf_pdesc *pd)
863 {
864           struct mbuf         *m = *m0;
865           struct pf_rule      *r;
866           struct pf_frent     *frent;
867           struct pf_fragment *frag = NULL;
868           struct ip *h = mtod(m, struct ip *);
869           int                 mff = (h->ip_off & htons(IP_MF));
870           int                 hlen = h->ip_hl << 2;
871           u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
872           u_int16_t max;
873           int                 ip_len;
874           int                 tag = -1;
875           int                 cpu = mycpu->gd_cpuid;
876 
877           r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
878           while (r != NULL) {
879                     r->evaluations++;
880                     if (pfi_kif_match(r->kif, kif) == r->ifnot)
881                               r = r->skip[PF_SKIP_IFP].ptr;
882                     else if (r->direction && r->direction != dir)
883                               r = r->skip[PF_SKIP_DIR].ptr;
884                     else if (r->af && r->af != AF_INET)
885                               r = r->skip[PF_SKIP_AF].ptr;
886                     else if (r->proto && r->proto != h->ip_p)
887                               r = r->skip[PF_SKIP_PROTO].ptr;
888                     else if (PF_MISMATCHAW(&r->src.addr,
889                         (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
890                         r->src.neg, kif))
891                               r = r->skip[PF_SKIP_SRC_ADDR].ptr;
892                     else if (PF_MISMATCHAW(&r->dst.addr,
893                         (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
894                         r->dst.neg, NULL))
895                               r = r->skip[PF_SKIP_DST_ADDR].ptr;
896                     else if (r->match_tag && !pf_match_tag(m, r, &tag))
897                               r = TAILQ_NEXT(r, entries);
898                     else
899                               break;
900           }
901 
902           if (r == NULL || r->action == PF_NOSCRUB)
903                     return (PF_PASS);
904           else {
905                     r->packets[dir == PF_OUT]++;
906                     r->bytes[dir == PF_OUT] += pd->tot_len;
907           }
908 
909           /* Check for illegal packets */
910           if (hlen < (int)sizeof(struct ip))
911                     goto drop;
912 
913           if (hlen > ntohs(h->ip_len))
914                     goto drop;
915 
916           /* Clear IP_DF if the rule uses the no-df option */
917           if ((r->rule_flag & PFRULE_NODF) && (h->ip_off & htons(IP_DF))) {
918                     u_int16_t ip_off = h->ip_off; /* network byte order */
919 
920                     h->ip_off &= ~htons(IP_DF);
921                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
922           }
923 
924           /* We will need other tests here */
925           if (!fragoff && !mff)
926                     goto no_fragment;
927 
928           /* A fragment; rehash required. */
929           m->m_flags &= ~M_HASH;
930 
931           /* We're dealing with a fragment now. Don't allow fragments
932            * with IP_DF to enter the cache. If the flag was cleared by
933            * no-df above, fine. Otherwise drop it.
934            */
935           if (h->ip_off & htons(IP_DF)) {
936                     DPFPRINTF(("IP_DF\n"));
937                     goto bad;
938           }
939 
940           ip_len = ntohs(h->ip_len) - hlen;
941 
942           /* All fragments are 8 byte aligned */
943           if (mff && (ip_len & 0x7)) {
944                     DPFPRINTF(("mff and %d\n", ip_len));
945                     goto bad;
946           }
947 
948           /* Respect maximum length */
949           if (fragoff + ip_len > IP_MAXPACKET) {
950                     DPFPRINTF(("max packet %d\n", fragoff + ip_len));
951                     goto bad;
952           }
953           max = fragoff + ip_len;
954 
955           if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
956                     /* Fully buffer all of the fragments */
957 
958                     frag = pf_find_fragment(h, &pf_frag_tree[cpu]);
959 
960                     /* Check if we saw the last fragment already */
961                     if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
962                         max > frag->fr_max)
963                               goto bad;
964 
965                     /* Get an entry for the fragment queue */
966                     frent = kmalloc(sizeof(struct pf_frent), M_PFFRENTPL, M_NOWAIT);
967                     if (frent == NULL) {
968                               REASON_SET(reason, PFRES_MEMORY);
969                               return (PF_DROP);
970                     }
971                     pf_nfrents++;
972                     frent->fr_ip = h;
973                     frent->fr_m = m;
974 
975                     /* Might return a completely reassembled mbuf, or NULL */
976                     DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
977                     *m0 = m = pf_reassemble(m0, &frag, frent, mff);
978 
979                     if (m == NULL)
980                               return (PF_DROP);
981 
982                     if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
983                               goto drop;
984 
985                     h = mtod(m, struct ip *);
986           } else {
987                     /* non-buffering fragment cache (drops or masks overlaps) */
988                     int       nomem = 0;
989 
990                     if (dir == PF_OUT && m->m_pkthdr.pf.flags & PF_TAG_FRAGCACHE) {
991                               /*
992                                * Already passed the fragment cache in the
993                                * input direction.  If we continued, it would
994                                * appear to be a dup and would be dropped.
995                                */
996                               goto fragment_pass;
997                     }
998 
999                     frag = pf_find_fragment(h, &pf_cache_tree[cpu]);
1000 
1001                     /* Check if we saw the last fragment already */
1002                     if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1003                         max > frag->fr_max) {
1004                               if (r->rule_flag & PFRULE_FRAGDROP)
1005                                         frag->fr_flags |= PFFRAG_DROP;
1006                               goto bad;
1007                     }
1008 
1009                     *m0 = m = pf_fragcache(m0, h, &frag, mff,
1010                         (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
1011                     if (m == NULL) {
1012                               if (nomem)
1013                                         goto no_mem;
1014                               goto drop;
1015                     }
1016 
1017                     if (dir == PF_IN)
1018                               m->m_pkthdr.pf.flags |= PF_TAG_FRAGCACHE;
1019 
1020                     if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1021                               goto drop;
1022                     goto fragment_pass;
1023           }
1024 
1025  no_fragment:
1026           /* At this point, only IP_DF is allowed in ip_off */
1027           if (h->ip_off & ~htons(IP_DF)) {
1028                     u_int16_t ip_off = h->ip_off; /* network byte order */
1029 
1030                     h->ip_off &= htons(IP_DF);
1031                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1032           }
1033 
1034           /* Enforce a minimum ttl, may cause endless packet loops */
1035           if (r->min_ttl && h->ip_ttl < r->min_ttl) {
1036                     u_int16_t ip_ttl = h->ip_ttl;
1037 
1038                     h->ip_ttl = r->min_ttl;
1039                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1040           }
1041 
1042           /* Enforce tos */
1043           if (r->rule_flag & PFRULE_SET_TOS) {
1044                     u_int16_t ov, nv;
1045 
1046                     ov = *(u_int16_t *)h;
1047                     h->ip_tos = r->set_tos;
1048                     nv = *(u_int16_t *)h;
1049 
1050                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
1051           }
1052 
1053           if (r->rule_flag & PFRULE_RANDOMID) {
1054                     u_int16_t ip_id = h->ip_id;
1055 
1056                     h->ip_id = ip_randomid();
1057                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
1058           }
1059           if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1060                     pd->flags |= PFDESC_IP_REAS;
1061 
1062           return (PF_PASS);
1063 
1064  fragment_pass:
1065           /* Enforce a minimum ttl, may cause endless packet loops */
1066           if (r->min_ttl && h->ip_ttl < r->min_ttl) {
1067                     u_int16_t ip_ttl = h->ip_ttl;
1068 
1069                     h->ip_ttl = r->min_ttl;
1070                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1071           }
1072           /* Enforce tos */
1073           if (r->rule_flag & PFRULE_SET_TOS) {
1074                     u_int16_t ov, nv;
1075 
1076                     ov = *(u_int16_t *)h;
1077                     h->ip_tos = r->set_tos;
1078                     nv = *(u_int16_t *)h;
1079 
1080                     h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
1081           }
1082           if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1083                     pd->flags |= PFDESC_IP_REAS;
1084           return (PF_PASS);
1085 
1086  no_mem:
1087           REASON_SET(reason, PFRES_MEMORY);
1088           if (r != NULL && r->log)
1089                     PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1090           return (PF_DROP);
1091 
1092  drop:
1093           REASON_SET(reason, PFRES_NORM);
1094           if (r != NULL && r->log)
1095                     PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1096           return (PF_DROP);
1097 
1098  bad:
1099           DPFPRINTF(("dropping bad fragment\n"));
1100 
1101           /* Free associated fragments */
1102           if (frag != NULL)
1103                     pf_free_fragment(frag);
1104 
1105           REASON_SET(reason, PFRES_FRAG);
1106           if (r != NULL && r->log)
1107                     PFLOG_PACKET(kif, h, m, AF_INET, dir, *reason, r, NULL, NULL, pd);
1108 
1109           return (PF_DROP);
1110 }
1111 
#ifdef INET6
/*
 * Normalize an IPv6 packet according to the first matching scrub rule.
 *
 * The extension header chain is walked starting right after the fixed
 * IPv6 header.  Hop-by-hop options are checked one by one, including the
 * jumbo payload option, and the resulting payload length is compared with
 * the mbuf packet length.  A fragment header only gets a length sanity
 * check; no IPv6 reassembly is performed here.
 *
 * Returns PF_PASS or PF_DROP; on PF_DROP *reason holds the PFRES_* code.
 */
int
pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
    u_short *reason, struct pf_pdesc *pd)
{
	struct mbuf		*m = *m0;
	struct pf_rule		*r;
	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
	struct ip6_ext		 ext;
	struct ip6_opt		 opt;
	struct ip6_opt_jumbo	 jumbo;
	struct ip6_frag		 frag;
	u_int32_t		 jumbolen = 0, plen;
	u_int16_t		 fragoff = 0;
	u_int8_t		 nxt;
	int			 off;
	int			 optend;
	int			 optoff;
	int			 done;

	/* Locate the first scrub rule matching this packet. */
	for (r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	    r != NULL; ) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot) {
			r = r->skip[PF_SKIP_IFP].ptr;
			continue;
		}
		if (r->direction && r->direction != dir) {
			r = r->skip[PF_SKIP_DIR].ptr;
			continue;
		}
		if (r->af && r->af != AF_INET6) {
			r = r->skip[PF_SKIP_AF].ptr;
			continue;
		}
#if 0 /* header chain! */
		if (r->proto && r->proto != h->ip6_nxt) {
			r = r->skip[PF_SKIP_PROTO].ptr;
			continue;
		}
#endif
		if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip6_src, AF_INET6,
		    r->src.neg, kif)) {
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
			continue;
		}
		if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
		    r->dst.neg, NULL)) {
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
			continue;
		}
		break;
	}

	if (r == NULL || r->action == PF_NOSCRUB)
		return (PF_PASS);

	/* Account the packet against the matching rule. */
	r->packets[dir == PF_OUT]++;
	r->bytes[dir == PF_OUT] += pd->tot_len;

	/* Check for illegal packets */
	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
		goto drop;

	/* Walk the extension headers until a non-extension protocol shows up */
	off = sizeof(struct ip6_hdr);
	nxt = h->ip6_nxt;
	done = 0;
	do {
		switch (nxt) {
		case IPPROTO_FRAGMENT:
			goto fragment;
		case IPPROTO_AH:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			/* AH length is in 4-byte units, the others in 8-byte units */
			if (nxt == IPPROTO_AH)
				off += (ext.ip6e_len + 2) * 4;
			else
				off += (ext.ip6e_len + 1) * 8;
			nxt = ext.ip6e_nxt;
			break;
		case IPPROTO_HOPOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			optend = off + (ext.ip6e_len + 1) * 8;
			optoff = off + sizeof(ext);
			do {
				/* Peek at the type first: Pad1 is a single byte */
				if (!pf_pull_hdr(m, optoff, &opt.ip6o_type,
				    sizeof(opt.ip6o_type), NULL, NULL,
				    AF_INET6))
					goto shortpkt;
				if (opt.ip6o_type == IP6OPT_PAD1) {
					optoff++;
					continue;
				}
				if (!pf_pull_hdr(m, optoff, &opt, sizeof(opt),
				    NULL, NULL, AF_INET6))
					goto shortpkt;
				/* The option must fit inside the hop-by-hop header */
				if (optoff + sizeof(opt) + opt.ip6o_len > optend)
					goto drop;
				if (opt.ip6o_type == IP6OPT_JUMBO) {
					/* Jumbo requires a zero ip6_plen */
					if (h->ip6_plen != 0)
						goto drop;
					if (!pf_pull_hdr(m, optoff, &jumbo,
					    sizeof(jumbo), NULL, NULL,
					    AF_INET6))
						goto shortpkt;
					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
					    sizeof(jumbolen));
					jumbolen = ntohl(jumbolen);
					if (jumbolen <= IPV6_MAXPACKET)
						goto drop;
					if (sizeof(struct ip6_hdr) + jumbolen !=
					    m->m_pkthdr.len)
						goto drop;
				}
				optoff += sizeof(opt) + opt.ip6o_len;
			} while (optoff < optend);

			off = optend;
			nxt = ext.ip6e_nxt;
			break;
		default:
			done = 1;
			break;
		}
	} while (!done);

	/* jumbo payload option must be present, or plen > 0 */
	plen = ntohs(h->ip6_plen);
	if (plen == 0)
		plen = jumbolen;
	if (plen == 0)
		goto drop;
	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
		goto shortpkt;

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip6_hlim < r->min_ttl)
		h->ip6_hlim = r->min_ttl;

	return (PF_PASS);

 fragment:
	/* Fragments may not be combined with a jumbo payload */
	if (ntohs(h->ip6_plen) == 0 || jumbolen)
		goto drop;
	plen = ntohs(h->ip6_plen);

	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
		goto shortpkt;
	fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
	if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)
		goto badfrag;

	/* do something about it */
	/* remember to set pd->flags |= PFDESC_IP_REAS */
	return (PF_PASS);

 shortpkt:
	REASON_SET(reason, PFRES_SHORT);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);

 drop:
	REASON_SET(reason, PFRES_NORM);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);

 badfrag:
	REASON_SET(reason, PFRES_FRAG);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd);
	return (PF_DROP);
}
#endif /* INET6 */
1290 
1291 int
pf_normalize_tcp(int dir,struct pfi_kif * kif,struct mbuf * m,int ipoff,int off,void * h,struct pf_pdesc * pd)1292 pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
1293     int off, void *h, struct pf_pdesc *pd)
1294 {
1295           struct pf_rule      *r, *rm = NULL;
1296           struct tcphdr       *th = pd->hdr.tcp;
1297           int                  rewrite = 0;
1298           u_short              reason;
1299           u_int8_t   flags;
1300           sa_family_t          af = pd->af;
1301 
1302           r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1303           while (r != NULL) {
1304                     r->evaluations++;
1305                     if (pfi_kif_match(r->kif, kif) == r->ifnot)
1306                               r = r->skip[PF_SKIP_IFP].ptr;
1307                     else if (r->direction && r->direction != dir)
1308                               r = r->skip[PF_SKIP_DIR].ptr;
1309                     else if (r->af && r->af != af)
1310                               r = r->skip[PF_SKIP_AF].ptr;
1311                     else if (r->proto && r->proto != pd->proto)
1312                               r = r->skip[PF_SKIP_PROTO].ptr;
1313                     else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
1314                         r->src.neg, kif))
1315                               r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1316                     else if (r->src.port_op && !pf_match_port(r->src.port_op,
1317                                   r->src.port[0], r->src.port[1], th->th_sport))
1318                               r = r->skip[PF_SKIP_SRC_PORT].ptr;
1319                     else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
1320                         r->dst.neg, NULL))
1321                               r = r->skip[PF_SKIP_DST_ADDR].ptr;
1322                     else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
1323                                   r->dst.port[0], r->dst.port[1], th->th_dport))
1324                               r = r->skip[PF_SKIP_DST_PORT].ptr;
1325                     else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
1326                                   pf_osfp_fingerprint(pd, m, off, th),
1327                                   r->os_fingerprint))
1328                               r = TAILQ_NEXT(r, entries);
1329                     else {
1330                               rm = r;
1331                               break;
1332                     }
1333           }
1334 
1335           if (rm == NULL || rm->action == PF_NOSCRUB)
1336                     return (PF_PASS);
1337           else {
1338                     r->packets[dir == PF_OUT]++;
1339                     r->bytes[dir == PF_OUT] += pd->tot_len;
1340           }
1341 
1342           if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
1343                     pd->flags |= PFDESC_TCP_NORM;
1344 
1345           flags = th->th_flags;
1346           if (flags & TH_SYN) {
1347                     /* Illegal packet */
1348                     if (flags & TH_RST)
1349                               goto tcp_drop;
1350 
1351                     if (flags & TH_FIN)
1352                               flags &= ~TH_FIN;
1353           } else {
1354                     /* Illegal packet */
1355                     if (!(flags & (TH_ACK|TH_RST)))
1356                               goto tcp_drop;
1357           }
1358 
1359           if (!(flags & TH_ACK)) {
1360                     /* These flags are only valid if ACK is set */
1361                     if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
1362                               goto tcp_drop;
1363           }
1364 
1365           /* Check for illegal header length */
1366           if (th->th_off < (sizeof(struct tcphdr) >> 2))
1367                     goto tcp_drop;
1368 
1369           /* If flags changed, or reserved data set, then adjust */
1370           if (flags != th->th_flags || th->th_x2 != 0) {
1371                     u_int16_t ov, nv;
1372 
1373                     ov = *(u_int16_t *)(&th->th_ack + 1);
1374                     th->th_flags = flags;
1375                     th->th_x2 = 0;
1376                     nv = *(u_int16_t *)(&th->th_ack + 1);
1377 
1378                     th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
1379                     rewrite = 1;
1380           }
1381 
1382           /* Remove urgent pointer, if TH_URG is not set */
1383           if (!(flags & TH_URG) && th->th_urp) {
1384                     th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
1385                     th->th_urp = 0;
1386                     rewrite = 1;
1387           }
1388 
1389           /* Process options */
1390           if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))
1391                     rewrite = 1;
1392 
1393           /* copy back packet headers if we sanitized */
1394           if (rewrite)
1395                     m_copyback(m, off, sizeof(*th), th);
1396 
1397           return (PF_PASS);
1398 
1399  tcp_drop:
1400           REASON_SET(&reason, PFRES_NORM);
1401           if (rm != NULL && r->log)
1402                     PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, r, NULL, NULL, pd);
1403           return (PF_DROP);
1404 }
1405 
1406 int
pf_normalize_tcp_init(struct mbuf * m,int off,struct pf_pdesc * pd,struct tcphdr * th,struct pf_state_peer * src,struct pf_state_peer * dst)1407 pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
1408     struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
1409 {
1410           u_int32_t tsval, tsecr;
1411           u_int8_t hdr[60];
1412           u_int8_t *opt;
1413 
1414           KASSERT((src->scrub == NULL),
1415               ("pf_normalize_tcp_init: src->scrub != NULL"));
1416 
1417           src->scrub = kmalloc(sizeof(struct pf_state_scrub), M_PFSTATESCRUBPL,
1418               M_NOWAIT | M_ZERO);
1419           if (src->scrub == NULL)
1420                     return (1);
1421 
1422           switch (pd->af) {
1423 #ifdef INET
1424           case AF_INET: {
1425                     struct ip *h = mtod(m, struct ip *);
1426                     src->scrub->pfss_ttl = h->ip_ttl;
1427                     break;
1428           }
1429 #endif /* INET */
1430 #ifdef INET6
1431           case AF_INET6: {
1432                     struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1433                     src->scrub->pfss_ttl = h->ip6_hlim;
1434                     break;
1435           }
1436 #endif /* INET6 */
1437           }
1438 
1439 
1440           /*
1441            * All normalizations below are only begun if we see the start of
1442            * the connections.  They must all set an enabled bit in pfss_flags
1443            */
1444           if ((th->th_flags & TH_SYN) == 0)
1445                     return (0);
1446 
1447 
1448           if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
1449               pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1450                     /* Diddle with TCP options */
1451                     int hlen;
1452                     opt = hdr + sizeof(struct tcphdr);
1453                     hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1454                     while (hlen >= TCPOLEN_TIMESTAMP) {
1455                               switch (*opt) {
1456                               case TCPOPT_EOL:    /* FALLTHROUGH */
1457                               case TCPOPT_NOP:
1458                                         opt++;
1459                                         hlen--;
1460                                         break;
1461                               case TCPOPT_TIMESTAMP:
1462                                         if (opt[1] >= TCPOLEN_TIMESTAMP) {
1463                                                   src->scrub->pfss_flags |=
1464                                                       PFSS_TIMESTAMP;
1465                                                   src->scrub->pfss_ts_mod = karc4random();
1466 
1467                                                   /* note PFSS_PAWS not set yet */
1468                                                   memcpy(&tsval, &opt[2],
1469                                                       sizeof(u_int32_t));
1470                                                   memcpy(&tsecr, &opt[6],
1471                                                       sizeof(u_int32_t));
1472                                                   src->scrub->pfss_tsval0 = ntohl(tsval);
1473                                                   src->scrub->pfss_tsval = ntohl(tsval);
1474                                                   src->scrub->pfss_tsecr = ntohl(tsecr);
1475                                                   getmicrouptime(&src->scrub->pfss_last);
1476                                         }
1477                                         /* FALLTHROUGH */
1478                               default:
1479                                         hlen -= MAX(opt[1], 2);
1480                                         opt += MAX(opt[1], 2);
1481                                         break;
1482                               }
1483                     }
1484           }
1485 
1486           return (0);
1487 }
1488 
1489 void
pf_normalize_tcp_cleanup(struct pf_state * state)1490 pf_normalize_tcp_cleanup(struct pf_state *state)
1491 {
1492           if (state->src.scrub)
1493                     kfree(state->src.scrub, M_PFSTATESCRUBPL);
1494           if (state->dst.scrub)
1495                     kfree(state->dst.scrub, M_PFSTATESCRUBPL);
1496 
1497           /* Someday... flush the TCP segment reassembly descriptors. */
1498 }
1499 
1500 int
pf_normalize_tcp_stateful(struct mbuf * m,int off,struct pf_pdesc * pd,u_short * reason,struct tcphdr * th,struct pf_state * state,struct pf_state_peer * src,struct pf_state_peer * dst,int * writeback)1501 pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
1502     u_short *reason, struct tcphdr *th, struct pf_state *state,
1503     struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
1504 {
1505           struct timeval uptime;
1506           u_int32_t tsval, tsecr;
1507           u_int tsval_from_last;
1508           u_int8_t hdr[60];
1509           u_int8_t *opt;
1510           int copyback = 0;
1511           int got_ts = 0;
1512 
1513           KASSERT((src->scrub || dst->scrub),
1514               ("pf_normalize_tcp_statefull: src->scrub && dst->scrub!"));
1515 
1516           tsval = 0;          /* avoid gcc complaint */
1517           tsecr = 0;          /* avoid gcc complaint */
1518 
1519           /*
1520            * Enforce the minimum TTL seen for this connection.  Negate a common
1521            * technique to evade an intrusion detection system and confuse
1522            * firewall state code.
1523            */
1524           switch (pd->af) {
1525 #ifdef INET
1526           case AF_INET: {
1527                     if (src->scrub) {
1528                               struct ip *h = mtod(m, struct ip *);
1529                               if (h->ip_ttl > src->scrub->pfss_ttl)
1530                                         src->scrub->pfss_ttl = h->ip_ttl;
1531                               h->ip_ttl = src->scrub->pfss_ttl;
1532                     }
1533                     break;
1534           }
1535 #endif /* INET */
1536 #ifdef INET6
1537           case AF_INET6: {
1538                     if (src->scrub) {
1539                               struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1540                               if (h->ip6_hlim > src->scrub->pfss_ttl)
1541                                         src->scrub->pfss_ttl = h->ip6_hlim;
1542                               h->ip6_hlim = src->scrub->pfss_ttl;
1543                     }
1544                     break;
1545           }
1546 #endif /* INET6 */
1547           }
1548 
1549           if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
1550               ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
1551               (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
1552               pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1553                     /* Diddle with TCP options */
1554                     int hlen;
1555                     opt = hdr + sizeof(struct tcphdr);
1556                     hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1557                     while (hlen >= TCPOLEN_TIMESTAMP) {
1558                               switch (*opt) {
1559                               case TCPOPT_EOL:    /* FALLTHROUGH */
1560                               case TCPOPT_NOP:
1561                                         opt++;
1562                                         hlen--;
1563                                         break;
1564                               case TCPOPT_TIMESTAMP:
1565                                         /* Modulate the timestamps.  Can be used for
1566                                          * NAT detection, OS uptime determination or
1567                                          * reboot detection.
1568                                          */
1569 
1570                                         if (got_ts) {
1571                                                   /* Huh?  Multiple timestamps!? */
1572                                                   if (pf_status.debug >= PF_DEBUG_MISC) {
1573                                                             DPFPRINTF(("multiple TS??"));
1574                                                             pf_print_state(state);
1575                                                             kprintf("\n");
1576                                                   }
1577                                                   REASON_SET(reason, PFRES_TS);
1578                                                   return (PF_DROP);
1579                                         }
1580                                         if (opt[1] >= TCPOLEN_TIMESTAMP) {
1581                                                   memcpy(&tsval, &opt[2],
1582                                                       sizeof(u_int32_t));
1583                                                   if (tsval && src->scrub &&
1584                                                       (src->scrub->pfss_flags &
1585                                                       PFSS_TIMESTAMP)) {
1586                                                             tsval = ntohl(tsval);
1587                                                             pf_change_a(&opt[2],
1588                                                                 &th->th_sum,
1589                                                                 htonl(tsval +
1590                                                                 src->scrub->pfss_ts_mod),
1591                                                                 0);
1592                                                             copyback = 1;
1593                                                   }
1594 
1595                                                   /* Modulate TS reply iff valid (!0) */
1596                                                   memcpy(&tsecr, &opt[6],
1597                                                       sizeof(u_int32_t));
1598                                                   if (tsecr && dst->scrub &&
1599                                                       (dst->scrub->pfss_flags &
1600                                                       PFSS_TIMESTAMP)) {
1601                                                             tsecr = ntohl(tsecr)
1602                                                                 - dst->scrub->pfss_ts_mod;
1603                                                             pf_change_a(&opt[6],
1604                                                                 &th->th_sum, htonl(tsecr),
1605                                                                 0);
1606                                                             copyback = 1;
1607                                                   }
1608                                                   got_ts = 1;
1609                                         }
1610                                         /* FALLTHROUGH */
1611                               default:
1612                                         hlen -= MAX(opt[1], 2);
1613                                         opt += MAX(opt[1], 2);
1614                                         break;
1615                               }
1616                     }
1617                     if (copyback) {
1618                               /* Copyback the options, caller copys back header */
1619                               *writeback = 1;
1620                               m_copyback(m, off + sizeof(struct tcphdr),
1621                                   (th->th_off << 2) - sizeof(struct tcphdr),
1622                                   hdr + sizeof(struct tcphdr));
1623                     }
1624           }
1625 
1626 
1627           /*
1628            * Must invalidate PAWS checks on connections idle for too long.
1629            * The fastest allowed timestamp clock is 1ms.  That turns out to
1630            * be about 24 days before it wraps.  XXX Right now our lowerbound
1631            * TS echo check only works for the first 12 days of a connection
1632            * when the TS has exhausted half its 32bit space
1633            */
1634 #define TS_MAX_IDLE (24*24*60*60)
1635 #define TS_MAX_CONN (12*24*60*60)       /* XXX remove when better tsecr check */
1636 
1637           getmicrouptime(&uptime);
1638           if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
1639               (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
1640               time_second - state->creation > TS_MAX_CONN))  {
1641                     if (pf_status.debug >= PF_DEBUG_MISC) {
1642                               DPFPRINTF(("src idled out of PAWS\n"));
1643                               pf_print_state(state);
1644                               kprintf("\n");
1645                     }
1646                     src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
1647                         | PFSS_PAWS_IDLED;
1648           }
1649           if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
1650               uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
1651                     if (pf_status.debug >= PF_DEBUG_MISC) {
1652                               DPFPRINTF(("dst idled out of PAWS\n"));
1653                               pf_print_state(state);
1654                               kprintf("\n");
1655                     }
1656                     dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
1657                         | PFSS_PAWS_IDLED;
1658           }
1659 
1660           if (got_ts && src->scrub && dst->scrub &&
1661               (src->scrub->pfss_flags & PFSS_PAWS) &&
1662               (dst->scrub->pfss_flags & PFSS_PAWS)) {
1663                     /* Validate that the timestamps are "in-window".
1664                      * RFC1323 describes TCP Timestamp options that allow
1665                      * measurement of RTT (round trip time) and PAWS
1666                      * (protection against wrapped sequence numbers).  PAWS
1667                      * gives us a set of rules for rejecting packets on
1668                      * long fat pipes (packets that were somehow delayed
1669                      * in transit longer than the time it took to send the
1670                      * full TCP sequence space of 4Gb).  We can use these
1671                      * rules and infer a few others that will let us treat
1672                      * the 32bit timestamp and the 32bit echoed timestamp
1673                      * as sequence numbers to prevent a blind attacker from
1674                      * inserting packets into a connection.
1675                      *
1676                      * RFC1323 tells us:
1677                      *  - The timestamp on this packet must be greater than
1678                      *    or equal to the last value echoed by the other
1679                      *    endpoint.  The RFC says those will be discarded
1680                      *    since it is a dup that has already been acked.
1681                      *    This gives us a lowerbound on the timestamp.
1682                      *        timestamp >= other last echoed timestamp
1683                      *  - The timestamp will be less than or equal to
1684                      *    the last timestamp plus the time between the
1685                      *    last packet and now.  The RFC defines the max
1686                      *    clock rate as 1ms.  We will allow clocks to be
1687                      *    up to 10% fast and will allow a total difference
1688                      *    or 30 seconds due to a route change.  And this
1689                      *    gives us an upperbound on the timestamp.
1690                      *        timestamp <= last timestamp + max ticks
1691                      *    We have to be careful here.  Windows will send an
1692                      *    initial timestamp of zero and then initialize it
1693                      *    to a random value after the 3whs; presumably to
1694                      *    avoid a DoS by having to call an expensive RNG
1695                      *    during a SYN flood.  Proof MS has at least one
1696                      *    good security geek.
1697                      *
1698                      *  - The TCP timestamp option must also echo the other
1699                      *    endpoints timestamp.  The timestamp echoed is the
1700                      *    one carried on the earliest unacknowledged segment
1701                      *    on the left edge of the sequence window.  The RFC
1702                      *    states that the host will reject any echoed
1703                      *    timestamps that were larger than any ever sent.
1704                      *    This gives us an upperbound on the TS echo.
1705                      *        tescr <= largest_tsval
1706                      *  - The lowerbound on the TS echo is a little more
1707                      *    tricky to determine.  The other endpoint's echoed
1708                      *    values will not decrease.  But there may be
1709                      *    network conditions that re-order packets and
1710                      *    cause our view of them to decrease.  For now the
1711                      *    only lowerbound we can safely determine is that
1712                      *    the TS echo will never be less than the original
1713                      *    TS.  XXX There is probably a better lowerbound.
1714                      *    Remove TS_MAX_CONN with better lowerbound check.
1715                      *        tescr >= other original TS
1716                      *
1717                      * It is also important to note that the fastest
1718                      * timestamp clock of 1ms will wrap its 32bit space in
1719                      * 24 days.  So we just disable TS checking after 24
1720                      * days of idle time.  We actually must use a 12d
1721                      * connection limit until we can come up with a better
1722                      * lowerbound to the TS echo check.
1723                      */
1724                     struct timeval delta_ts;
1725                     int ts_fudge;
1726 
1727 
1728                     /*
1729                      * PFTM_TS_DIFF is how many seconds of leeway to allow
1730                      * a host's timestamp.  This can happen if the previous
1731                      * packet got delayed in transit for much longer than
1732                      * this packet.
1733                      */
1734                     if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
1735                               ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];
1736 
1737 
1738                     /* Calculate max ticks since the last timestamp */
1739 #define TS_MAXFREQ  1100                /* RFC max TS freq of 1Khz + 10% skew */
1740 #define TS_MICROSECS          1000000             /* microseconds per second */
1741 #ifndef timersub
1742 #define timersub(tvp, uvp, vvp)                                                           \
1743           do {                                                                            \
1744                     (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;              \
1745                     (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
1746                     if ((vvp)->tv_usec < 0) {                                   \
1747                               (vvp)->tv_sec--;                                  \
1748                               (vvp)->tv_usec += 1000000;                        \
1749                     }                                                                     \
1750           } while (0)
1751 #endif
1752 
1753                     timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
1754                     tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
1755                     tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
1756 
1757 
1758                     if ((src->state >= TCPS_ESTABLISHED &&
1759                         dst->state >= TCPS_ESTABLISHED) &&
1760                         (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
1761                         SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
1762                         (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
1763                         SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
1764                               /* Bad RFC1323 implementation or an insertion attack.
1765                                *
1766                                * - Solaris 2.6 and 2.7 are known to send another ACK
1767                                *   after the FIN,FIN|ACK,ACK closing that carries
1768                                *   an old timestamp.
1769                                */
1770 
1771                               DPFPRINTF(("Timestamp failed %c%c%c%c\n",
1772                                   SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
1773                                   SEQ_GT(tsval, src->scrub->pfss_tsval +
1774                                   tsval_from_last) ? '1' : ' ',
1775                                   SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
1776                                   SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
1777                               DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
1778                                   "idle: %lus %lums\n",
1779                                   tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
1780                                   delta_ts.tv_usec / 1000));
1781                               DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
1782                                   src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
1783                               DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
1784                                   "\n", dst->scrub->pfss_tsval,
1785                                   dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
1786                               if (pf_status.debug >= PF_DEBUG_MISC) {
1787                                         pf_print_state(state);
1788                                         pf_print_flags(th->th_flags);
1789                                         kprintf("\n");
1790                               }
1791                               REASON_SET(reason, PFRES_TS);
1792                               return (PF_DROP);
1793                     }
1794 
1795                     /* XXX I'd really like to require tsecr but it's optional */
1796 
1797           } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
1798               ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
1799               || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
1800               src->scrub && dst->scrub &&
1801               (src->scrub->pfss_flags & PFSS_PAWS) &&
1802               (dst->scrub->pfss_flags & PFSS_PAWS)) {
1803                     /* Didn't send a timestamp.  Timestamps aren't really useful
1804                      * when:
1805                      *  - connection opening or closing (often not even sent).
1806                      *    but we must not let an attacker to put a FIN on a
1807                      *    data packet to sneak it through our ESTABLISHED check.
1808                      *  - on a TCP reset.  RFC suggests not even looking at TS.
1809                      *  - on an empty ACK.  The TS will not be echoed so it will
1810                      *    probably not help keep the RTT calculation in sync and
1811                      *    there isn't as much danger when the sequence numbers
1812                      *    got wrapped.  So some stacks don't include TS on empty
1813                      *    ACKs :-(
1814                      *
1815                      * To minimize the disruption to mostly RFC1323 conformant
1816                      * stacks, we will only require timestamps on data packets.
1817                      *
1818                      * And what do ya know, we cannot require timestamps on data
1819                      * packets.  There appear to be devices that do legitimate
1820                      * TCP connection hijacking.  There are HTTP devices that allow
1821                      * a 3whs (with timestamps) and then buffer the HTTP request.
1822                      * If the intermediate device has the HTTP response cache, it
1823                      * will spoof the response but not bother timestamping its
1824                      * packets.  So we can look for the presence of a timestamp in
1825                      * the first data packet and if there, require it in all future
1826                      * packets.
1827                      */
1828 
1829                     if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
1830                               /*
1831                                * Hey!  Someone tried to sneak a packet in.  Or the
1832                                * stack changed its RFC1323 behavior?!?!
1833                                */
1834                               if (pf_status.debug >= PF_DEBUG_MISC) {
1835                                         DPFPRINTF(("Did not receive expected RFC1323 "
1836                                             "timestamp\n"));
1837                                         pf_print_state(state);
1838                                         pf_print_flags(th->th_flags);
1839                                         kprintf("\n");
1840                               }
1841                               REASON_SET(reason, PFRES_TS);
1842                               return (PF_DROP);
1843                     }
1844           }
1845 
1846 
1847           /*
1848            * We will note if a host sends his data packets with or without
1849            * timestamps.  And require all data packets to contain a timestamp
1850            * if the first does.  PAWS implicitly requires that all data packets be
1851            * timestamped.  But I think there are middle-man devices that hijack
1852            * TCP streams immediately after the 3whs and don't timestamp their
1853            * packets (seen in a WWW accelerator or cache).
1854            */
1855           if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
1856               (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
1857                     if (got_ts)
1858                               src->scrub->pfss_flags |= PFSS_DATA_TS;
1859                     else {
1860                               src->scrub->pfss_flags |= PFSS_DATA_NOTS;
1861                               if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
1862                                   (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
1863                                         /* Don't warn if other host rejected RFC1323 */
1864                                         DPFPRINTF(("Broken RFC1323 stack did not "
1865                                             "timestamp data packet. Disabled PAWS "
1866                                             "security.\n"));
1867                                         pf_print_state(state);
1868                                         pf_print_flags(th->th_flags);
1869                                         kprintf("\n");
1870                               }
1871                     }
1872           }
1873 
1874 
1875           /*
1876            * Update PAWS values
1877            */
1878           if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
1879               (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
1880                     getmicrouptime(&src->scrub->pfss_last);
1881                     if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
1882                         (src->scrub->pfss_flags & PFSS_PAWS) == 0)
1883                               src->scrub->pfss_tsval = tsval;
1884 
1885                     if (tsecr) {
1886                               if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
1887                                   (src->scrub->pfss_flags & PFSS_PAWS) == 0)
1888                                         src->scrub->pfss_tsecr = tsecr;
1889 
1890                               if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
1891                                   (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
1892                                   src->scrub->pfss_tsval0 == 0)) {
1893                                         /* tsval0 MUST be the lowest timestamp */
1894                                         src->scrub->pfss_tsval0 = tsval;
1895                               }
1896 
1897                               /* Only fully initialized after a TS gets echoed */
1898                               if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
1899                                         src->scrub->pfss_flags |= PFSS_PAWS;
1900                     }
1901           }
1902 
1903           /* I have a dream....  TCP segment reassembly.... */
1904           return (0);
1905 }
1906 
1907 int
pf_normalize_tcpopt(struct pf_rule * r,struct mbuf * m,struct tcphdr * th,int off,sa_family_t af)1908 pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
1909     int off, sa_family_t af)
1910 {
1911           u_int16_t *mss;
1912           int                  thoff;
1913           int                  opt, cnt, optlen = 0;
1914           int                  rewrite = 0;
1915           u_char               opts[TCP_MAXOLEN];
1916           u_char              *optp = opts;
1917 
1918           thoff = th->th_off << 2;
1919           cnt = thoff - sizeof(struct tcphdr);
1920 
1921           if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
1922               NULL, NULL, af))
1923                     return (rewrite);
1924 
1925           for (; cnt > 0; cnt -= optlen, optp += optlen) {
1926                     opt = optp[0];
1927                     if (opt == TCPOPT_EOL)
1928                               break;
1929                     if (opt == TCPOPT_NOP)
1930                               optlen = 1;
1931                     else {
1932                               if (cnt < 2)
1933                                         break;
1934                               optlen = optp[1];
1935                               if (optlen < 2 || optlen > cnt)
1936                                         break;
1937                     }
1938                     switch (opt) {
1939                     case TCPOPT_MAXSEG:
1940                               mss = (u_int16_t *)(optp + 2);
1941                               if ((ntohs(*mss)) > r->max_mss) {
1942                                         th->th_sum = pf_cksum_fixup(th->th_sum,
1943                                             *mss, htons(r->max_mss), 0);
1944                                         *mss = htons(r->max_mss);
1945                                         rewrite = 1;
1946                               }
1947                               break;
1948                     default:
1949                               break;
1950                     }
1951           }
1952 
1953           if (rewrite)
1954                     m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
1955 
1956           return (rewrite);
1957 }
1958