xref: /dragonfly/contrib/libpcap/optimize.c (revision e75ef36f1332e115895388cede9dfd24ca1a806c)
1 /*
2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
3  *        The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that: (1) source code distributions
7  * retain the above copyright notice and this paragraph in its entirety, (2)
8  * distributions including binary code include the above copyright notice and
9  * this paragraph in its entirety in the documentation or other materials
10  * provided with the distribution, and (3) all advertising materials mentioning
11  * features or use of this software display the following acknowledgement:
12  * ``This product includes software developed by the University of California,
13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
14  * the University nor the names of its contributors may be used to endorse
15  * or promote products derived from this software without specific prior
16  * written permission.
17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
20  *
21  *  Optimization module for BPF code intermediate representation.
22  */
23 
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 
28 #include <pcap-types.h>
29 
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <memory.h>
33 #include <setjmp.h>
34 #include <string.h>
35 
36 #include <errno.h>
37 
38 #include "pcap-int.h"
39 
40 #include "gencode.h"
41 #include "optimize.h"
42 
43 #ifdef HAVE_OS_PROTO_H
44 #include "os-proto.h"
45 #endif
46 
47 #ifdef BDEBUG
48 /*
49  * The internal "debug printout" flag for the filter expression optimizer.
50  * The code to print that stuff is present only if BDEBUG is defined, so
51  * the flag, and the routine to set it, are defined only if BDEBUG is
52  * defined.
53  */
54 static int pcap_optimizer_debug;
55 
56 /*
57  * Routine to set that flag.
58  *
59  * This is intended for libpcap developers, not for general use.
60  * If you want to set these in a program, you'll have to declare this
61  * routine yourself, with the appropriate DLL import attribute on Windows;
62  * it's not declared in any header file, and won't be declared in any
63  * header file provided by libpcap.
64  */
65 PCAP_API void pcap_set_optimizer_debug(int value);
66 
67 PCAP_API_DEF void
pcap_set_optimizer_debug(int value)68 pcap_set_optimizer_debug(int value)
69 {
70           pcap_optimizer_debug = value;
71 }
72 
73 /*
74  * The internal "print dot graph" flag for the filter expression optimizer.
75  * The code to print that stuff is present only if BDEBUG is defined, so
76  * the flag, and the routine to set it, are defined only if BDEBUG is
77  * defined.
78  */
79 static int pcap_print_dot_graph;
80 
81 /*
82  * Routine to set that flag.
83  *
84  * This is intended for libpcap developers, not for general use.
85  * If you want to set these in a program, you'll have to declare this
86  * routine yourself, with the appropriate DLL import attribute on Windows;
87  * it's not declared in any header file, and won't be declared in any
88  * header file provided by libpcap.
89  */
90 PCAP_API void pcap_set_print_dot_graph(int value);
91 
92 PCAP_API_DEF void
pcap_set_print_dot_graph(int value)93 pcap_set_print_dot_graph(int value)
94 {
95           pcap_print_dot_graph = value;
96 }
97 
98 #endif
99 
100 /*
101  * lowest_set_bit().
102  *
103  * Takes a 32-bit integer as an argument.
104  *
105  * If handed a non-zero value, returns the index of the lowest set bit,
106  * counting upwards from zero.
107  *
108  * If handed zero, the results are platform- and compiler-dependent.
109  * Keep it out of the light, don't give it any water, don't feed it
110  * after midnight, and don't pass zero to it.
111  *
112  * This is the same as the count of trailing zeroes in the word.
113  */
114 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
115   /*
116    * GCC 3.4 and later; we have __builtin_ctz().
117    */
118   #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
119 #elif defined(_MSC_VER)
120   /*
121    * Visual Studio; we support only 2005 and later, so use
122    * _BitScanForward().
123    */
124 #include <intrin.h>
125 
126 #ifndef __clang__
127 #pragma intrinsic(_BitScanForward)
128 #endif
129 
130 static __forceinline u_int
lowest_set_bit(int mask)131 lowest_set_bit(int mask)
132 {
133           unsigned long bit;
134 
135           /*
136            * Don't sign-extend mask if long is longer than int.
137            * (It's currently not, in MSVC, even on 64-bit platforms, but....)
138            */
139           if (_BitScanForward(&bit, (unsigned int)mask) == 0)
140                     abort();  /* mask is zero */
141           return (u_int)bit;
142 }
143 #elif defined(MSDOS) && defined(__DJGPP__)
144   /*
145    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
146    * we've already included.
147    */
148   #define lowest_set_bit(mask)          ((u_int)(ffs((mask)) - 1))
149 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
150   /*
151    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
152    * or some other platform (UN*X conforming to a sufficient recent version
153    * of the Single UNIX Specification).
154    */
155   #include <strings.h>
156   #define lowest_set_bit(mask)          (u_int)((ffs((mask)) - 1))
157 #else
158 /*
159  * None of the above.
160  * Use a perfect-hash-function-based function.
161  */
162 static u_int
lowest_set_bit(int mask)163 lowest_set_bit(int mask)
164 {
165           unsigned int v = (unsigned int)mask;
166 
167           static const u_int MultiplyDeBruijnBitPosition[32] = {
168                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
169                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
170           };
171 
172           /*
173            * We strip off all but the lowermost set bit (v & ~v),
174            * and perform a minimal perfect hash on it to look up the
175            * number of low-order zero bits in a table.
176            *
177            * See:
178            *
179            *        http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
180            *
181            *        http://supertech.csail.mit.edu/papers/debruijn.pdf
182            */
183           return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
184 }
185 #endif
186 
187 /*
188  * Represents a deleted instruction.
189  */
190 #define NOP -1
191 
192 /*
193  * Register numbers for use-def values.
194  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
195  * location.  A_ATOM is the accumulator and X_ATOM is the index
196  * register.
197  */
198 #define A_ATOM BPF_MEMWORDS
199 #define X_ATOM (BPF_MEMWORDS+1)
200 
201 /*
202  * This define is used to represent *both* the accumulator and
203  * x register in use-def computations.
204  * Currently, the use-def code assumes only one definition per instruction.
205  */
206 #define AX_ATOM N_ATOMS
207 
208 /*
209  * These data structures are used in a Cocke and Shwarz style
210  * value numbering scheme.  Since the flowgraph is acyclic,
211  * exit values can be propagated from a node's predecessors
212  * provided it is uniquely defined.
213  */
214 struct valnode {
215           int code;
216           bpf_u_int32 v0, v1;
217           int val;            /* the value number */
218           struct valnode *next;
219 };
220 
221 /* Integer constants mapped with the load immediate opcode. */
222 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
223 
224 struct vmapinfo {
225           int is_const;
226           bpf_u_int32 const_val;
227 };
228 
229 typedef struct {
230           /*
231            * Place to longjmp to on an error.
232            */
233           jmp_buf top_ctx;
234 
235           /*
236            * The buffer into which to put error message.
237            */
238           char *errbuf;
239 
240           /*
241            * A flag to indicate that further optimization is needed.
242            * Iterative passes are continued until a given pass yields no
243            * code simplification or branch movement.
244            */
245           int done;
246 
247           /*
248            * XXX - detect loops that do nothing but repeated AND/OR pullups
249            * and edge moves.
250            * If 100 passes in a row do nothing but that, treat that as a
251            * sign that we're in a loop that just shuffles in a cycle in
252            * which each pass just shuffles the code and we eventually
253            * get back to the original configuration.
254            *
255            * XXX - we need a non-heuristic way of detecting, or preventing,
256            * such a cycle.
257            */
258           int non_branch_movement_performed;
259 
260           u_int n_blocks;               /* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
261           struct block **blocks;
262           u_int n_edges;                /* twice n_blocks, so guaranteed to be > 0 */
263           struct edge **edges;
264 
265           /*
266            * A bit vector set representation of the dominators.
267            * We round up the set size to the next power of two.
268            */
269           u_int nodewords;    /* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
270           u_int edgewords;    /* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
271           struct block **levels;
272           bpf_u_int32 *space;
273 
274 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
275 /*
276  * True if a is in uset {p}
277  */
278 #define SET_MEMBER(p, a) \
279 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
280 
281 /*
282  * Add 'a' to uset p.
283  */
284 #define SET_INSERT(p, a) \
285 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
286 
287 /*
288  * Delete 'a' from uset p.
289  */
290 #define SET_DELETE(p, a) \
291 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
292 
293 /*
294  * a := a intersect b
295  * n must be guaranteed to be > 0
296  */
297 #define SET_INTERSECT(a, b, n)\
298 {\
299           register bpf_u_int32 *_x = a, *_y = b;\
300           register u_int _n = n;\
301           do *_x++ &= *_y++; while (--_n != 0);\
302 }
303 
304 /*
305  * a := a - b
306  * n must be guaranteed to be > 0
307  */
308 #define SET_SUBTRACT(a, b, n)\
309 {\
310           register bpf_u_int32 *_x = a, *_y = b;\
311           register u_int _n = n;\
312           do *_x++ &=~ *_y++; while (--_n != 0);\
313 }
314 
315 /*
316  * a := a union b
317  * n must be guaranteed to be > 0
318  */
319 #define SET_UNION(a, b, n)\
320 {\
321           register bpf_u_int32 *_x = a, *_y = b;\
322           register u_int _n = n;\
323           do *_x++ |= *_y++; while (--_n != 0);\
324 }
325 
326           uset all_dom_sets;
327           uset all_closure_sets;
328           uset all_edge_sets;
329 
330 #define MODULUS 213
331           struct valnode *hashtbl[MODULUS];
332           bpf_u_int32 curval;
333           bpf_u_int32 maxval;
334 
335           struct vmapinfo *vmap;
336           struct valnode *vnode_base;
337           struct valnode *next_vnode;
338 } opt_state_t;
339 
340 typedef struct {
341           /*
342            * Place to longjmp to on an error.
343            */
344           jmp_buf top_ctx;
345 
346           /*
347            * The buffer into which to put error message.
348            */
349           char *errbuf;
350 
351           /*
352            * Some pointers used to convert the basic block form of the code,
353            * into the array form that BPF requires.  'fstart' will point to
354            * the malloc'd array while 'ftail' is used during the recursive
355            * traversal.
356            */
357           struct bpf_insn *fstart;
358           struct bpf_insn *ftail;
359 } conv_state_t;
360 
361 static void opt_init(opt_state_t *, struct icode *);
362 static void opt_cleanup(opt_state_t *);
363 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
364     PCAP_PRINTFLIKE(2, 3);
365 
366 static void intern_blocks(opt_state_t *, struct icode *);
367 
368 static void find_inedges(opt_state_t *, struct block *);
369 #ifdef BDEBUG
370 static void opt_dump(opt_state_t *, struct icode *);
371 #endif
372 
373 #ifndef MAX
374 #define MAX(a,b) ((a)>(b)?(a):(b))
375 #endif
376 
377 static void
find_levels_r(opt_state_t * opt_state,struct icode * ic,struct block * b)378 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
379 {
380           int level;
381 
382           if (isMarked(ic, b))
383                     return;
384 
385           Mark(ic, b);
386           b->link = 0;
387 
388           if (JT(b)) {
389                     find_levels_r(opt_state, ic, JT(b));
390                     find_levels_r(opt_state, ic, JF(b));
391                     level = MAX(JT(b)->level, JF(b)->level) + 1;
392           } else
393                     level = 0;
394           b->level = level;
395           b->link = opt_state->levels[level];
396           opt_state->levels[level] = b;
397 }
398 
399 /*
400  * Level graph.  The levels go from 0 at the leaves to
401  * N_LEVELS at the root.  The opt_state->levels[] array points to the
402  * first node of the level list, whose elements are linked
403  * with the 'link' field of the struct block.
404  */
405 static void
find_levels(opt_state_t * opt_state,struct icode * ic)406 find_levels(opt_state_t *opt_state, struct icode *ic)
407 {
408           memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
409           unMarkAll(ic);
410           find_levels_r(opt_state, ic, ic->root);
411 }
412 
413 /*
414  * Find dominator relationships.
415  * Assumes graph has been leveled.
416  */
417 static void
find_dom(opt_state_t * opt_state,struct block * root)418 find_dom(opt_state_t *opt_state, struct block *root)
419 {
420           u_int i;
421           int level;
422           struct block *b;
423           bpf_u_int32 *x;
424 
425           /*
426            * Initialize sets to contain all nodes.
427            */
428           x = opt_state->all_dom_sets;
429           /*
430            * In opt_init(), we've made sure the product doesn't overflow.
431            */
432           i = opt_state->n_blocks * opt_state->nodewords;
433           while (i != 0) {
434                     --i;
435                     *x++ = 0xFFFFFFFFU;
436           }
437           /* Root starts off empty. */
438           for (i = opt_state->nodewords; i != 0;) {
439                     --i;
440                     root->dom[i] = 0;
441           }
442 
443           /* root->level is the highest level no found. */
444           for (level = root->level; level >= 0; --level) {
445                     for (b = opt_state->levels[level]; b; b = b->link) {
446                               SET_INSERT(b->dom, b->id);
447                               if (JT(b) == 0)
448                                         continue;
449                               SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
450                               SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
451                     }
452           }
453 }
454 
455 static void
propedom(opt_state_t * opt_state,struct edge * ep)456 propedom(opt_state_t *opt_state, struct edge *ep)
457 {
458           SET_INSERT(ep->edom, ep->id);
459           if (ep->succ) {
460                     SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
461                     SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
462           }
463 }
464 
465 /*
466  * Compute edge dominators.
467  * Assumes graph has been leveled and predecessors established.
468  */
469 static void
find_edom(opt_state_t * opt_state,struct block * root)470 find_edom(opt_state_t *opt_state, struct block *root)
471 {
472           u_int i;
473           uset x;
474           int level;
475           struct block *b;
476 
477           x = opt_state->all_edge_sets;
478           /*
479            * In opt_init(), we've made sure the product doesn't overflow.
480            */
481           for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
482                     --i;
483                     x[i] = 0xFFFFFFFFU;
484           }
485 
486           /* root->level is the highest level no found. */
487           memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
488           memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
489           for (level = root->level; level >= 0; --level) {
490                     for (b = opt_state->levels[level]; b != 0; b = b->link) {
491                               propedom(opt_state, &b->et);
492                               propedom(opt_state, &b->ef);
493                     }
494           }
495 }
496 
497 /*
498  * Find the backwards transitive closure of the flow graph.  These sets
499  * are backwards in the sense that we find the set of nodes that reach
500  * a given node, not the set of nodes that can be reached by a node.
501  *
502  * Assumes graph has been leveled.
503  */
504 static void
find_closure(opt_state_t * opt_state,struct block * root)505 find_closure(opt_state_t *opt_state, struct block *root)
506 {
507           int level;
508           struct block *b;
509 
510           /*
511            * Initialize sets to contain no nodes.
512            */
513           memset((char *)opt_state->all_closure_sets, 0,
514                 opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
515 
516           /* root->level is the highest level no found. */
517           for (level = root->level; level >= 0; --level) {
518                     for (b = opt_state->levels[level]; b; b = b->link) {
519                               SET_INSERT(b->closure, b->id);
520                               if (JT(b) == 0)
521                                         continue;
522                               SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
523                               SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
524                     }
525           }
526 }
527 
528 /*
529  * Return the register number that is used by s.
530  *
531  * Returns ATOM_A if A is used, ATOM_X if X is used, AX_ATOM if both A and X
532  * are used, the scratch memory location's number if a scratch memory
533  * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
534  *
535  * The implementation should probably change to an array access.
536  */
537 static int
atomuse(struct stmt * s)538 atomuse(struct stmt *s)
539 {
540           register int c = s->code;
541 
542           if (c == NOP)
543                     return -1;
544 
545           switch (BPF_CLASS(c)) {
546 
547           case BPF_RET:
548                     return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
549                               (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
550 
551           case BPF_LD:
552           case BPF_LDX:
553                     /*
554                      * As there are fewer than 2^31 memory locations,
555                      * s->k should be convertible to int without problems.
556                      */
557                     return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
558                               (BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
559 
560           case BPF_ST:
561                     return A_ATOM;
562 
563           case BPF_STX:
564                     return X_ATOM;
565 
566           case BPF_JMP:
567           case BPF_ALU:
568                     if (BPF_SRC(c) == BPF_X)
569                               return AX_ATOM;
570                     return A_ATOM;
571 
572           case BPF_MISC:
573                     return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
574           }
575           abort();
576           /* NOTREACHED */
577 }
578 
579 /*
580  * Return the register number that is defined by 's'.  We assume that
581  * a single stmt cannot define more than one register.  If no register
582  * is defined, return -1.
583  *
584  * The implementation should probably change to an array access.
585  */
586 static int
atomdef(struct stmt * s)587 atomdef(struct stmt *s)
588 {
589           if (s->code == NOP)
590                     return -1;
591 
592           switch (BPF_CLASS(s->code)) {
593 
594           case BPF_LD:
595           case BPF_ALU:
596                     return A_ATOM;
597 
598           case BPF_LDX:
599                     return X_ATOM;
600 
601           case BPF_ST:
602           case BPF_STX:
603                     return s->k;
604 
605           case BPF_MISC:
606                     return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
607           }
608           return -1;
609 }
610 
611 /*
612  * Compute the sets of registers used, defined, and killed by 'b'.
613  *
614  * "Used" means that a statement in 'b' uses the register before any
615  * statement in 'b' defines it, i.e. it uses the value left in
616  * that register by a predecessor block of this block.
617  * "Defined" means that a statement in 'b' defines it.
618  * "Killed" means that a statement in 'b' defines it before any
619  * statement in 'b' uses it, i.e. it kills the value left in that
620  * register by a predecessor block of this block.
621  */
622 static void
compute_local_ud(struct block * b)623 compute_local_ud(struct block *b)
624 {
625           struct slist *s;
626           atomset def = 0, use = 0, killed = 0;
627           int atom;
628 
629           for (s = b->stmts; s; s = s->next) {
630                     if (s->s.code == NOP)
631                               continue;
632                     atom = atomuse(&s->s);
633                     if (atom >= 0) {
634                               if (atom == AX_ATOM) {
635                                         if (!ATOMELEM(def, X_ATOM))
636                                                   use |= ATOMMASK(X_ATOM);
637                                         if (!ATOMELEM(def, A_ATOM))
638                                                   use |= ATOMMASK(A_ATOM);
639                               }
640                               else if (atom < N_ATOMS) {
641                                         if (!ATOMELEM(def, atom))
642                                                   use |= ATOMMASK(atom);
643                               }
644                               else
645                                         abort();
646                     }
647                     atom = atomdef(&s->s);
648                     if (atom >= 0) {
649                               if (!ATOMELEM(use, atom))
650                                         killed |= ATOMMASK(atom);
651                               def |= ATOMMASK(atom);
652                     }
653           }
654           if (BPF_CLASS(b->s.code) == BPF_JMP) {
655                     /*
656                      * XXX - what about RET?
657                      */
658                     atom = atomuse(&b->s);
659                     if (atom >= 0) {
660                               if (atom == AX_ATOM) {
661                                         if (!ATOMELEM(def, X_ATOM))
662                                                   use |= ATOMMASK(X_ATOM);
663                                         if (!ATOMELEM(def, A_ATOM))
664                                                   use |= ATOMMASK(A_ATOM);
665                               }
666                               else if (atom < N_ATOMS) {
667                                         if (!ATOMELEM(def, atom))
668                                                   use |= ATOMMASK(atom);
669                               }
670                               else
671                                         abort();
672                     }
673           }
674 
675           b->def = def;
676           b->kill = killed;
677           b->in_use = use;
678 }
679 
680 /*
681  * Assume graph is already leveled.
682  */
683 static void
find_ud(opt_state_t * opt_state,struct block * root)684 find_ud(opt_state_t *opt_state, struct block *root)
685 {
686           int i, maxlevel;
687           struct block *p;
688 
689           /*
690            * root->level is the highest level no found;
691            * count down from there.
692            */
693           maxlevel = root->level;
694           for (i = maxlevel; i >= 0; --i)
695                     for (p = opt_state->levels[i]; p; p = p->link) {
696                               compute_local_ud(p);
697                               p->out_use = 0;
698                     }
699 
700           for (i = 1; i <= maxlevel; ++i) {
701                     for (p = opt_state->levels[i]; p; p = p->link) {
702                               p->out_use |= JT(p)->in_use | JF(p)->in_use;
703                               p->in_use |= p->out_use &~ p->kill;
704                     }
705           }
706 }
707 static void
init_val(opt_state_t * opt_state)708 init_val(opt_state_t *opt_state)
709 {
710           opt_state->curval = 0;
711           opt_state->next_vnode = opt_state->vnode_base;
712           memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
713           memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
714 }
715 
716 /*
717  * Because we really don't have an IR, this stuff is a little messy.
718  *
719  * This routine looks in the table of existing value number for a value
720  * with generated from an operation with the specified opcode and
721  * the specified values.  If it finds it, it returns its value number,
722  * otherwise it makes a new entry in the table and returns the
723  * value number of that entry.
724  */
725 static bpf_u_int32
F(opt_state_t * opt_state,int code,bpf_u_int32 v0,bpf_u_int32 v1)726 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
727 {
728           u_int hash;
729           bpf_u_int32 val;
730           struct valnode *p;
731 
732           hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
733           hash %= MODULUS;
734 
735           for (p = opt_state->hashtbl[hash]; p; p = p->next)
736                     if (p->code == code && p->v0 == v0 && p->v1 == v1)
737                               return p->val;
738 
739           /*
740            * Not found.  Allocate a new value, and assign it a new
741            * value number.
742            *
743            * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
744            * increment it before using it as the new value number, which
745            * means we never assign VAL_UNKNOWN.
746            *
747            * XXX - unless we overflow, but we probably won't have 2^32-1
748            * values; we treat 32 bits as effectively infinite.
749            */
750           val = ++opt_state->curval;
751           if (BPF_MODE(code) == BPF_IMM &&
752               (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
753                     opt_state->vmap[val].const_val = v0;
754                     opt_state->vmap[val].is_const = 1;
755           }
756           p = opt_state->next_vnode++;
757           p->val = val;
758           p->code = code;
759           p->v0 = v0;
760           p->v1 = v1;
761           p->next = opt_state->hashtbl[hash];
762           opt_state->hashtbl[hash] = p;
763 
764           return val;
765 }
766 
767 static inline void
vstore(struct stmt * s,bpf_u_int32 * valp,bpf_u_int32 newval,int alter)768 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
769 {
770           if (alter && newval != VAL_UNKNOWN && *valp == newval)
771                     s->code = NOP;
772           else
773                     *valp = newval;
774 }
775 
776 /*
777  * Do constant-folding on binary operators.
778  * (Unary operators are handled elsewhere.)
779  */
780 static void
fold_op(opt_state_t * opt_state,struct stmt * s,bpf_u_int32 v0,bpf_u_int32 v1)781 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
782 {
783           bpf_u_int32 a, b;
784 
785           a = opt_state->vmap[v0].const_val;
786           b = opt_state->vmap[v1].const_val;
787 
788           switch (BPF_OP(s->code)) {
789           case BPF_ADD:
790                     a += b;
791                     break;
792 
793           case BPF_SUB:
794                     a -= b;
795                     break;
796 
797           case BPF_MUL:
798                     a *= b;
799                     break;
800 
801           case BPF_DIV:
802                     if (b == 0)
803                               opt_error(opt_state, "division by zero");
804                     a /= b;
805                     break;
806 
807           case BPF_MOD:
808                     if (b == 0)
809                               opt_error(opt_state, "modulus by zero");
810                     a %= b;
811                     break;
812 
813           case BPF_AND:
814                     a &= b;
815                     break;
816 
817           case BPF_OR:
818                     a |= b;
819                     break;
820 
821           case BPF_XOR:
822                     a ^= b;
823                     break;
824 
825           case BPF_LSH:
826                     /*
827                      * A left shift of more than the width of the type
828                      * is undefined in C; we'll just treat it as shifting
829                      * all the bits out.
830                      *
831                      * XXX - the BPF interpreter doesn't check for this,
832                      * so its behavior is dependent on the behavior of
833                      * the processor on which it's running.  There are
834                      * processors on which it shifts all the bits out
835                      * and processors on which it does no shift.
836                      */
837                     if (b < 32)
838                               a <<= b;
839                     else
840                               a = 0;
841                     break;
842 
843           case BPF_RSH:
844                     /*
845                      * A right shift of more than the width of the type
846                      * is undefined in C; we'll just treat it as shifting
847                      * all the bits out.
848                      *
849                      * XXX - the BPF interpreter doesn't check for this,
850                      * so its behavior is dependent on the behavior of
851                      * the processor on which it's running.  There are
852                      * processors on which it shifts all the bits out
853                      * and processors on which it does no shift.
854                      */
855                     if (b < 32)
856                               a >>= b;
857                     else
858                               a = 0;
859                     break;
860 
861           default:
862                     abort();
863           }
864           s->k = a;
865           s->code = BPF_LD|BPF_IMM;
866           /*
867            * XXX - optimizer loop detection.
868            */
869           opt_state->non_branch_movement_performed = 1;
870           opt_state->done = 0;
871 }
872 
873 static inline struct slist *
this_op(struct slist * s)874 this_op(struct slist *s)
875 {
876           while (s != 0 && s->s.code == NOP)
877                     s = s->next;
878           return s;
879 }
880 
881 static void
opt_not(struct block * b)882 opt_not(struct block *b)
883 {
884           struct block *tmp = JT(b);
885 
886           JT(b) = JF(b);
887           JF(b) = tmp;
888 }
889 
890 static void
opt_peep(opt_state_t * opt_state,struct block * b)891 opt_peep(opt_state_t *opt_state, struct block *b)
892 {
893           struct slist *s;
894           struct slist *next, *last;
895           bpf_u_int32 val;
896 
897           s = b->stmts;
898           if (s == 0)
899                     return;
900 
901           last = s;
902           for (/*empty*/; /*empty*/; s = next) {
903                     /*
904                      * Skip over nops.
905                      */
906                     s = this_op(s);
907                     if (s == 0)
908                               break;    /* nothing left in the block */
909 
910                     /*
911                      * Find the next real instruction after that one
912                      * (skipping nops).
913                      */
914                     next = this_op(s->next);
915                     if (next == 0)
916                               break;    /* no next instruction */
917                     last = next;
918 
919                     /*
920                      * st  M[k]         -->       st  M[k]
921                      * ldx M[k]                   tax
922                      */
923                     if (s->s.code == BPF_ST &&
924                         next->s.code == (BPF_LDX|BPF_MEM) &&
925                         s->s.k == next->s.k) {
926                               /*
927                                * XXX - optimizer loop detection.
928                                */
929                               opt_state->non_branch_movement_performed = 1;
930                               opt_state->done = 0;
931                               next->s.code = BPF_MISC|BPF_TAX;
932                     }
933                     /*
934                      * ld  #k -->       ldx  #k
935                      * tax                        txa
936                      */
937                     if (s->s.code == (BPF_LD|BPF_IMM) &&
938                         next->s.code == (BPF_MISC|BPF_TAX)) {
939                               s->s.code = BPF_LDX|BPF_IMM;
940                               next->s.code = BPF_MISC|BPF_TXA;
941                               /*
942                                * XXX - optimizer loop detection.
943                                */
944                               opt_state->non_branch_movement_performed = 1;
945                               opt_state->done = 0;
946                     }
947                     /*
948                      * This is an ugly special case, but it happens
949                      * when you say tcp[k] or udp[k] where k is a constant.
950                      */
951                     if (s->s.code == (BPF_LD|BPF_IMM)) {
952                               struct slist *add, *tax, *ild;
953 
954                               /*
955                                * Check that X isn't used on exit from this
956                                * block (which the optimizer might cause).
957                                * We know the code generator won't generate
958                                * any local dependencies.
959                                */
960                               if (ATOMELEM(b->out_use, X_ATOM))
961                                         continue;
962 
963                               /*
964                                * Check that the instruction following the ldi
965                                * is an addx, or it's an ldxms with an addx
966                                * following it (with 0 or more nops between the
967                                * ldxms and addx).
968                                */
969                               if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
970                                         add = next;
971                               else
972                                         add = this_op(next->next);
973                               if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
974                                         continue;
975 
976                               /*
977                                * Check that a tax follows that (with 0 or more
978                                * nops between them).
979                                */
980                               tax = this_op(add->next);
981                               if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
982                                         continue;
983 
984                               /*
985                                * Check that an ild follows that (with 0 or more
986                                * nops between them).
987                                */
988                               ild = this_op(tax->next);
989                               if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
990                                   BPF_MODE(ild->s.code) != BPF_IND)
991                                         continue;
992                               /*
993                                * We want to turn this sequence:
994                                *
995                                * (004) ldi     #0x2                   {s}
996                                * (005) ldxms   [14]                   {next}  -- optional
997                                * (006) addx                           {add}
998                                * (007) tax                            {tax}
999                                * (008) ild     [x+0]                  {ild}
1000                                *
1001                                * into this sequence:
1002                                *
1003                                * (004) nop
1004                                * (005) ldxms   [14]
1005                                * (006) nop
1006                                * (007) nop
1007                                * (008) ild     [x+2]
1008                                *
1009                                * XXX We need to check that X is not
1010                                * subsequently used, because we want to change
1011                                * what'll be in it after this sequence.
1012                                *
1013                                * We know we can eliminate the accumulator
1014                                * modifications earlier in the sequence since
1015                                * it is defined by the last stmt of this sequence
1016                                * (i.e., the last statement of the sequence loads
1017                                * a value into the accumulator, so we can eliminate
1018                                * earlier operations on the accumulator).
1019                                */
1020                               ild->s.k += s->s.k;
1021                               s->s.code = NOP;
1022                               add->s.code = NOP;
1023                               tax->s.code = NOP;
1024                               /*
1025                                * XXX - optimizer loop detection.
1026                                */
1027                               opt_state->non_branch_movement_performed = 1;
1028                               opt_state->done = 0;
1029                     }
1030           }
1031           /*
1032            * If the comparison at the end of a block is an equality
1033            * comparison against a constant, and nobody uses the value
1034            * we leave in the A register at the end of a block, and
1035            * the operation preceding the comparison is an arithmetic
1036            * operation, we can sometime optimize it away.
1037            */
1038           if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
1039               !ATOMELEM(b->out_use, A_ATOM)) {
1040                     /*
1041                      * We can optimize away certain subtractions of the
1042                      * X register.
1043                      */
1044                     if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
1045                               val = b->val[X_ATOM];
1046                               if (opt_state->vmap[val].is_const) {
1047                                         /*
1048                                          * If we have a subtract to do a comparison,
1049                                          * and the X register is a known constant,
1050                                          * we can merge this value into the
1051                                          * comparison:
1052                                          *
1053                                          * sub x  ->        nop
1054                                          * jeq #y jeq #(x+y)
1055                                          */
1056                                         b->s.k += opt_state->vmap[val].const_val;
1057                                         last->s.code = NOP;
1058                                         /*
1059                                          * XXX - optimizer loop detection.
1060                                          */
1061                                         opt_state->non_branch_movement_performed = 1;
1062                                         opt_state->done = 0;
1063                               } else if (b->s.k == 0) {
1064                                         /*
1065                                          * If the X register isn't a constant,
1066                                          * and the comparison in the test is
1067                                          * against 0, we can compare with the
1068                                          * X register, instead:
1069                                          *
1070                                          * sub x  ->        nop
1071                                          * jeq #0 jeq x
1072                                          */
1073                                         last->s.code = NOP;
1074                                         b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1075                                         /*
1076                                          * XXX - optimizer loop detection.
1077                                          */
1078                                         opt_state->non_branch_movement_performed = 1;
1079                                         opt_state->done = 0;
1080                               }
1081                     }
1082                     /*
1083                      * Likewise, a constant subtract can be simplified:
1084                      *
1085                      * sub #x ->        nop
1086                      * jeq #y ->        jeq #(x+y)
1087                      */
1088                     else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1089                               last->s.code = NOP;
1090                               b->s.k += last->s.k;
1091                               /*
1092                                * XXX - optimizer loop detection.
1093                                */
1094                               opt_state->non_branch_movement_performed = 1;
1095                               opt_state->done = 0;
1096                     }
1097                     /*
1098                      * And, similarly, a constant AND can be simplified
1099                      * if we're testing against 0, i.e.:
1100                      *
1101                      * and #k nop
1102                      * jeq #0  ->       jset #k
1103                      */
1104                     else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1105                         b->s.k == 0) {
1106                               b->s.k = last->s.k;
1107                               b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1108                               last->s.code = NOP;
1109                               /*
1110                                * XXX - optimizer loop detection.
1111                                */
1112                               opt_state->non_branch_movement_performed = 1;
1113                               opt_state->done = 0;
1114                               opt_not(b);
1115                     }
1116           }
1117           /*
1118            * jset #0        ->   never
1119            * jset #ffffffff ->   always
1120            */
1121           if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1122                     if (b->s.k == 0)
1123                               JT(b) = JF(b);
1124                     if (b->s.k == 0xffffffffU)
1125                               JF(b) = JT(b);
1126           }
1127           /*
1128            * If we're comparing against the index register, and the index
1129            * register is a known constant, we can just compare against that
1130            * constant.
1131            */
1132           val = b->val[X_ATOM];
1133           if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1134                     bpf_u_int32 v = opt_state->vmap[val].const_val;
1135                     b->s.code &= ~BPF_X;
1136                     b->s.k = v;
1137           }
1138           /*
1139            * If the accumulator is a known constant, we can compute the
1140            * comparison result.
1141            */
1142           val = b->val[A_ATOM];
1143           if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1144                     bpf_u_int32 v = opt_state->vmap[val].const_val;
1145                     switch (BPF_OP(b->s.code)) {
1146 
1147                     case BPF_JEQ:
1148                               v = v == b->s.k;
1149                               break;
1150 
1151                     case BPF_JGT:
1152                               v = v > b->s.k;
1153                               break;
1154 
1155                     case BPF_JGE:
1156                               v = v >= b->s.k;
1157                               break;
1158 
1159                     case BPF_JSET:
1160                               v &= b->s.k;
1161                               break;
1162 
1163                     default:
1164                               abort();
1165                     }
1166                     if (JF(b) != JT(b)) {
1167                               /*
1168                                * XXX - optimizer loop detection.
1169                                */
1170                               opt_state->non_branch_movement_performed = 1;
1171                               opt_state->done = 0;
1172                     }
1173                     if (v)
1174                               JF(b) = JT(b);
1175                     else
1176                               JT(b) = JF(b);
1177           }
1178 }
1179 
1180 /*
1181  * Compute the symbolic value of expression of 's', and update
1182  * anything it defines in the value table 'val'.  If 'alter' is true,
1183  * do various optimizations.  This code would be cleaner if symbolic
1184  * evaluation and code transformations weren't folded together.
1185  */
1186 static void
opt_stmt(opt_state_t * opt_state,struct stmt * s,bpf_u_int32 val[],int alter)1187 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
1188 {
1189           int op;
1190           bpf_u_int32 v;
1191 
1192           switch (s->code) {
1193 
1194           case BPF_LD|BPF_ABS|BPF_W:
1195           case BPF_LD|BPF_ABS|BPF_H:
1196           case BPF_LD|BPF_ABS|BPF_B:
1197                     v = F(opt_state, s->code, s->k, 0L);
1198                     vstore(s, &val[A_ATOM], v, alter);
1199                     break;
1200 
1201           case BPF_LD|BPF_IND|BPF_W:
1202           case BPF_LD|BPF_IND|BPF_H:
1203           case BPF_LD|BPF_IND|BPF_B:
1204                     v = val[X_ATOM];
1205                     if (alter && opt_state->vmap[v].is_const) {
1206                               s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1207                               s->k += opt_state->vmap[v].const_val;
1208                               v = F(opt_state, s->code, s->k, 0L);
1209                               /*
1210                                * XXX - optimizer loop detection.
1211                                */
1212                               opt_state->non_branch_movement_performed = 1;
1213                               opt_state->done = 0;
1214                     }
1215                     else
1216                               v = F(opt_state, s->code, s->k, v);
1217                     vstore(s, &val[A_ATOM], v, alter);
1218                     break;
1219 
1220           case BPF_LD|BPF_LEN:
1221                     v = F(opt_state, s->code, 0L, 0L);
1222                     vstore(s, &val[A_ATOM], v, alter);
1223                     break;
1224 
1225           case BPF_LD|BPF_IMM:
1226                     v = K(s->k);
1227                     vstore(s, &val[A_ATOM], v, alter);
1228                     break;
1229 
1230           case BPF_LDX|BPF_IMM:
1231                     v = K(s->k);
1232                     vstore(s, &val[X_ATOM], v, alter);
1233                     break;
1234 
1235           case BPF_LDX|BPF_MSH|BPF_B:
1236                     v = F(opt_state, s->code, s->k, 0L);
1237                     vstore(s, &val[X_ATOM], v, alter);
1238                     break;
1239 
1240           case BPF_ALU|BPF_NEG:
1241                     if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1242                               s->code = BPF_LD|BPF_IMM;
1243                               /*
1244                                * Do this negation as unsigned arithmetic; that's
1245                                * what modern BPF engines do, and it guarantees
1246                                * that all possible values can be negated.  (Yeah,
1247                                * negating 0x80000000, the minimum signed 32-bit
1248                                * two's-complement value, results in 0x80000000,
1249                                * so it's still negative, but we *should* be doing
1250                                * all unsigned arithmetic here, to match what
1251                                * modern BPF engines do.)
1252                                *
1253                                * Express it as 0U - (unsigned value) so that we
1254                                * don't get compiler warnings about negating an
1255                                * unsigned value and don't get UBSan warnings
1256                                * about the result of negating 0x80000000 being
1257                                * undefined.
1258                                */
1259                               s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
1260                               val[A_ATOM] = K(s->k);
1261                     }
1262                     else
1263                               val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1264                     break;
1265 
1266           case BPF_ALU|BPF_ADD|BPF_K:
1267           case BPF_ALU|BPF_SUB|BPF_K:
1268           case BPF_ALU|BPF_MUL|BPF_K:
1269           case BPF_ALU|BPF_DIV|BPF_K:
1270           case BPF_ALU|BPF_MOD|BPF_K:
1271           case BPF_ALU|BPF_AND|BPF_K:
1272           case BPF_ALU|BPF_OR|BPF_K:
1273           case BPF_ALU|BPF_XOR|BPF_K:
1274           case BPF_ALU|BPF_LSH|BPF_K:
1275           case BPF_ALU|BPF_RSH|BPF_K:
1276                     op = BPF_OP(s->code);
1277                     if (alter) {
1278                               if (s->k == 0) {
1279                                         /*
1280                                          * Optimize operations where the constant
1281                                          * is zero.
1282                                          *
1283                                          * Don't optimize away "sub #0"
1284                                          * as it may be needed later to
1285                                          * fixup the generated math code.
1286                                          *
1287                                          * Fail if we're dividing by zero or taking
1288                                          * a modulus by zero.
1289                                          */
1290                                         if (op == BPF_ADD ||
1291                                             op == BPF_LSH || op == BPF_RSH ||
1292                                             op == BPF_OR || op == BPF_XOR) {
1293                                                   s->code = NOP;
1294                                                   break;
1295                                         }
1296                                         if (op == BPF_MUL || op == BPF_AND) {
1297                                                   s->code = BPF_LD|BPF_IMM;
1298                                                   val[A_ATOM] = K(s->k);
1299                                                   break;
1300                                         }
1301                                         if (op == BPF_DIV)
1302                                                   opt_error(opt_state,
1303                                                       "division by zero");
1304                                         if (op == BPF_MOD)
1305                                                   opt_error(opt_state,
1306                                                       "modulus by zero");
1307                               }
1308                               if (opt_state->vmap[val[A_ATOM]].is_const) {
1309                                         fold_op(opt_state, s, val[A_ATOM], K(s->k));
1310                                         val[A_ATOM] = K(s->k);
1311                                         break;
1312                               }
1313                     }
1314                     val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1315                     break;
1316 
1317           case BPF_ALU|BPF_ADD|BPF_X:
1318           case BPF_ALU|BPF_SUB|BPF_X:
1319           case BPF_ALU|BPF_MUL|BPF_X:
1320           case BPF_ALU|BPF_DIV|BPF_X:
1321           case BPF_ALU|BPF_MOD|BPF_X:
1322           case BPF_ALU|BPF_AND|BPF_X:
1323           case BPF_ALU|BPF_OR|BPF_X:
1324           case BPF_ALU|BPF_XOR|BPF_X:
1325           case BPF_ALU|BPF_LSH|BPF_X:
1326           case BPF_ALU|BPF_RSH|BPF_X:
1327                     op = BPF_OP(s->code);
1328                     if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1329                               if (opt_state->vmap[val[A_ATOM]].is_const) {
1330                                         fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1331                                         val[A_ATOM] = K(s->k);
1332                               }
1333                               else {
1334                                         s->code = BPF_ALU|BPF_K|op;
1335                                         s->k = opt_state->vmap[val[X_ATOM]].const_val;
1336                                         if ((op == BPF_LSH || op == BPF_RSH) &&
1337                                             s->k > 31)
1338                                                   opt_error(opt_state,
1339                                                       "shift by more than 31 bits");
1340                                         /*
1341                                          * XXX - optimizer loop detection.
1342                                          */
1343                                         opt_state->non_branch_movement_performed = 1;
1344                                         opt_state->done = 0;
1345                                         val[A_ATOM] =
1346                                                   F(opt_state, s->code, val[A_ATOM], K(s->k));
1347                               }
1348                               break;
1349                     }
1350                     /*
1351                      * Check if we're doing something to an accumulator
1352                      * that is 0, and simplify.  This may not seem like
1353                      * much of a simplification but it could open up further
1354                      * optimizations.
1355                      * XXX We could also check for mul by 1, etc.
1356                      */
1357                     if (alter && opt_state->vmap[val[A_ATOM]].is_const
1358                         && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1359                               if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1360                                         s->code = BPF_MISC|BPF_TXA;
1361                                         vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1362                                         break;
1363                               }
1364                               else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1365                                          op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1366                                         s->code = BPF_LD|BPF_IMM;
1367                                         s->k = 0;
1368                                         vstore(s, &val[A_ATOM], K(s->k), alter);
1369                                         break;
1370                               }
1371                               else if (op == BPF_NEG) {
1372                                         s->code = NOP;
1373                                         break;
1374                               }
1375                     }
1376                     val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1377                     break;
1378 
1379           case BPF_MISC|BPF_TXA:
1380                     vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1381                     break;
1382 
1383           case BPF_LD|BPF_MEM:
1384                     v = val[s->k];
1385                     if (alter && opt_state->vmap[v].is_const) {
1386                               s->code = BPF_LD|BPF_IMM;
1387                               s->k = opt_state->vmap[v].const_val;
1388                               /*
1389                                * XXX - optimizer loop detection.
1390                                */
1391                               opt_state->non_branch_movement_performed = 1;
1392                               opt_state->done = 0;
1393                     }
1394                     vstore(s, &val[A_ATOM], v, alter);
1395                     break;
1396 
1397           case BPF_MISC|BPF_TAX:
1398                     vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1399                     break;
1400 
1401           case BPF_LDX|BPF_MEM:
1402                     v = val[s->k];
1403                     if (alter && opt_state->vmap[v].is_const) {
1404                               s->code = BPF_LDX|BPF_IMM;
1405                               s->k = opt_state->vmap[v].const_val;
1406                               /*
1407                                * XXX - optimizer loop detection.
1408                                */
1409                               opt_state->non_branch_movement_performed = 1;
1410                               opt_state->done = 0;
1411                     }
1412                     vstore(s, &val[X_ATOM], v, alter);
1413                     break;
1414 
1415           case BPF_ST:
1416                     vstore(s, &val[s->k], val[A_ATOM], alter);
1417                     break;
1418 
1419           case BPF_STX:
1420                     vstore(s, &val[s->k], val[X_ATOM], alter);
1421                     break;
1422           }
1423 }
1424 
1425 static void
deadstmt(opt_state_t * opt_state,register struct stmt * s,register struct stmt * last[])1426 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1427 {
1428           register int atom;
1429 
1430           atom = atomuse(s);
1431           if (atom >= 0) {
1432                     if (atom == AX_ATOM) {
1433                               last[X_ATOM] = 0;
1434                               last[A_ATOM] = 0;
1435                     }
1436                     else
1437                               last[atom] = 0;
1438           }
1439           atom = atomdef(s);
1440           if (atom >= 0) {
1441                     if (last[atom]) {
1442                               /*
1443                                * XXX - optimizer loop detection.
1444                                */
1445                               opt_state->non_branch_movement_performed = 1;
1446                               opt_state->done = 0;
1447                               last[atom]->code = NOP;
1448                     }
1449                     last[atom] = s;
1450           }
1451 }
1452 
1453 static void
opt_deadstores(opt_state_t * opt_state,register struct block * b)1454 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1455 {
1456           register struct slist *s;
1457           register int atom;
1458           struct stmt *last[N_ATOMS];
1459 
1460           memset((char *)last, 0, sizeof last);
1461 
1462           for (s = b->stmts; s != 0; s = s->next)
1463                     deadstmt(opt_state, &s->s, last);
1464           deadstmt(opt_state, &b->s, last);
1465 
1466           for (atom = 0; atom < N_ATOMS; ++atom)
1467                     if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1468                               last[atom]->code = NOP;
1469                               /*
1470                                * XXX - optimizer loop detection.
1471                                */
1472                               opt_state->non_branch_movement_performed = 1;
1473                               opt_state->done = 0;
1474                     }
1475 }
1476 
1477 static void
opt_blk(opt_state_t * opt_state,struct block * b,int do_stmts)1478 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1479 {
1480           struct slist *s;
1481           struct edge *p;
1482           int i;
1483           bpf_u_int32 aval, xval;
1484 
1485 #if 0
1486           for (s = b->stmts; s && s->next; s = s->next)
1487                     if (BPF_CLASS(s->s.code) == BPF_JMP) {
1488                               do_stmts = 0;
1489                               break;
1490                     }
1491 #endif
1492 
1493           /*
1494            * Initialize the atom values.
1495            */
1496           p = b->in_edges;
1497           if (p == 0) {
1498                     /*
1499                      * We have no predecessors, so everything is undefined
1500                      * upon entry to this block.
1501                      */
1502                     memset((char *)b->val, 0, sizeof(b->val));
1503           } else {
1504                     /*
1505                      * Inherit values from our predecessors.
1506                      *
1507                      * First, get the values from the predecessor along the
1508                      * first edge leading to this node.
1509                      */
1510                     memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1511                     /*
1512                      * Now look at all the other nodes leading to this node.
1513                      * If, for the predecessor along that edge, a register
1514                      * has a different value from the one we have (i.e.,
1515                      * control paths are merging, and the merging paths
1516                      * assign different values to that register), give the
1517                      * register the undefined value of 0.
1518                      */
1519                     while ((p = p->next) != NULL) {
1520                               for (i = 0; i < N_ATOMS; ++i)
1521                                         if (b->val[i] != p->pred->val[i])
1522                                                   b->val[i] = 0;
1523                     }
1524           }
1525           aval = b->val[A_ATOM];
1526           xval = b->val[X_ATOM];
1527           for (s = b->stmts; s; s = s->next)
1528                     opt_stmt(opt_state, &s->s, b->val, do_stmts);
1529 
1530           /*
1531            * This is a special case: if we don't use anything from this
1532            * block, and we load the accumulator or index register with a
1533            * value that is already there, or if this block is a return,
1534            * eliminate all the statements.
1535            *
1536            * XXX - what if it does a store?  Presumably that falls under
1537            * the heading of "if we don't use anything from this block",
1538            * i.e., if we use any memory location set to a different
1539            * value by this block, then we use something from this block.
1540            *
1541            * XXX - why does it matter whether we use anything from this
1542            * block?  If the accumulator or index register doesn't change
1543            * its value, isn't that OK even if we use that value?
1544            *
1545            * XXX - if we load the accumulator with a different value,
1546            * and the block ends with a conditional branch, we obviously
1547            * can't eliminate it, as the branch depends on that value.
1548            * For the index register, the conditional branch only depends
1549            * on the index register value if the test is against the index
1550            * register value rather than a constant; if nothing uses the
1551            * value we put into the index register, and we're not testing
1552            * against the index register's value, and there aren't any
1553            * other problems that would keep us from eliminating this
1554            * block, can we eliminate it?
1555            */
1556           if (do_stmts &&
1557               ((b->out_use == 0 &&
1558                 aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1559                 xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1560                BPF_CLASS(b->s.code) == BPF_RET)) {
1561                     if (b->stmts != 0) {
1562                               b->stmts = 0;
1563                               /*
1564                                * XXX - optimizer loop detection.
1565                                */
1566                               opt_state->non_branch_movement_performed = 1;
1567                               opt_state->done = 0;
1568                     }
1569           } else {
1570                     opt_peep(opt_state, b);
1571                     opt_deadstores(opt_state, b);
1572           }
1573           /*
1574            * Set up values for branch optimizer.
1575            */
1576           if (BPF_SRC(b->s.code) == BPF_K)
1577                     b->oval = K(b->s.k);
1578           else
1579                     b->oval = b->val[X_ATOM];
1580           b->et.code = b->s.code;
1581           b->ef.code = -b->s.code;
1582 }
1583 
1584 /*
1585  * Return true if any register that is used on exit from 'succ', has
1586  * an exit value that is different from the corresponding exit value
1587  * from 'b'.
1588  */
1589 static int
use_conflict(struct block * b,struct block * succ)1590 use_conflict(struct block *b, struct block *succ)
1591 {
1592           int atom;
1593           atomset use = succ->out_use;
1594 
1595           if (use == 0)
1596                     return 0;
1597 
1598           for (atom = 0; atom < N_ATOMS; ++atom)
1599                     if (ATOMELEM(use, atom))
1600                               if (b->val[atom] != succ->val[atom])
1601                                         return 1;
1602           return 0;
1603 }
1604 
1605 /*
1606  * Given a block that is the successor of an edge, and an edge that
1607  * dominates that edge, return either a pointer to a child of that
1608  * block (a block to which that block jumps) if that block is a
1609  * candidate to replace the successor of the latter edge or NULL
1610  * if neither of the children of the first block are candidates.
1611  */
1612 static struct block *
fold_edge(struct block * child,struct edge * ep)1613 fold_edge(struct block *child, struct edge *ep)
1614 {
1615           int sense;
1616           bpf_u_int32 aval0, aval1, oval0, oval1;
1617           int code = ep->code;
1618 
1619           if (code < 0) {
1620                     /*
1621                      * This edge is a "branch if false" edge.
1622                      */
1623                     code = -code;
1624                     sense = 0;
1625           } else {
1626                     /*
1627                      * This edge is a "branch if true" edge.
1628                      */
1629                     sense = 1;
1630           }
1631 
1632           /*
1633            * If the opcode for the branch at the end of the block we
1634            * were handed isn't the same as the opcode for the branch
1635            * to which the edge we were handed corresponds, the tests
1636            * for those branches aren't testing the same conditions,
1637            * so the blocks to which the first block branches aren't
1638            * candidates to replace the successor of the edge.
1639            */
1640           if (child->s.code != code)
1641                     return 0;
1642 
1643           aval0 = child->val[A_ATOM];
1644           oval0 = child->oval;
1645           aval1 = ep->pred->val[A_ATOM];
1646           oval1 = ep->pred->oval;
1647 
1648           /*
1649            * If the A register value on exit from the successor block
1650            * isn't the same as the A register value on exit from the
1651            * predecessor of the edge, the blocks to which the first
1652            * block branches aren't candidates to replace the successor
1653            * of the edge.
1654            */
1655           if (aval0 != aval1)
1656                     return 0;
1657 
1658           if (oval0 == oval1)
1659                     /*
1660                      * The operands of the branch instructions are
1661                      * identical, so the branches are testing the
1662                      * same condition, and the result is true if a true
1663                      * branch was taken to get here, otherwise false.
1664                      */
1665                     return sense ? JT(child) : JF(child);
1666 
1667           if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1668                     /*
1669                      * At this point, we only know the comparison if we
1670                      * came down the true branch, and it was an equality
1671                      * comparison with a constant.
1672                      *
1673                      * I.e., if we came down the true branch, and the branch
1674                      * was an equality comparison with a constant, we know the
1675                      * accumulator contains that constant.  If we came down
1676                      * the false branch, or the comparison wasn't with a
1677                      * constant, we don't know what was in the accumulator.
1678                      *
1679                      * We rely on the fact that distinct constants have distinct
1680                      * value numbers.
1681                      */
1682                     return JF(child);
1683 
1684           return 0;
1685 }
1686 
1687 /*
1688  * If we can make this edge go directly to a child of the edge's current
1689  * successor, do so.
1690  */
1691 static void
opt_j(opt_state_t * opt_state,struct edge * ep)1692 opt_j(opt_state_t *opt_state, struct edge *ep)
1693 {
1694           register u_int i, k;
1695           register struct block *target;
1696 
1697           /*
1698            * Does this edge go to a block where, if the test
1699            * at the end of it succeeds, it goes to a block
1700            * that's a leaf node of the DAG, i.e. a return
1701            * statement?
1702            * If so, there's nothing to optimize.
1703            */
1704           if (JT(ep->succ) == 0)
1705                     return;
1706 
1707           /*
1708            * Does this edge go to a block that goes, in turn, to
1709            * the same block regardless of whether the test at the
1710            * end succeeds or fails?
1711            */
1712           if (JT(ep->succ) == JF(ep->succ)) {
1713                     /*
1714                      * Common branch targets can be eliminated, provided
1715                      * there is no data dependency.
1716                      *
1717                      * Check whether any register used on exit from the
1718                      * block to which the successor of this edge goes
1719                      * has a value at that point that's different from
1720                      * the value it has on exit from the predecessor of
1721                      * this edge.  If not, the predecessor of this edge
1722                      * can just go to the block to which the successor
1723                      * of this edge goes, bypassing the successor of this
1724                      * edge, as the successor of this edge isn't doing
1725                      * any calculations whose results are different
1726                      * from what the blocks before it did and isn't
1727                      * doing any tests the results of which matter.
1728                      */
1729                     if (!use_conflict(ep->pred, JT(ep->succ))) {
1730                               /*
1731                                * No, there isn't.
1732                                * Make this edge go to the block to
1733                                * which the successor of that edge
1734                                * goes.
1735                                *
1736                                * XXX - optimizer loop detection.
1737                                */
1738                               opt_state->non_branch_movement_performed = 1;
1739                               opt_state->done = 0;
1740                               ep->succ = JT(ep->succ);
1741                     }
1742           }
1743           /*
1744            * For each edge dominator that matches the successor of this
1745            * edge, promote the edge successor to the its grandchild.
1746            *
1747            * XXX We violate the set abstraction here in favor a reasonably
1748            * efficient loop.
1749            */
1750  top:
1751           for (i = 0; i < opt_state->edgewords; ++i) {
1752                     /* i'th word in the bitset of dominators */
1753                     register bpf_u_int32 x = ep->edom[i];
1754 
1755                     while (x != 0) {
1756                               /* Find the next dominator in that word and mark it as found */
1757                               k = lowest_set_bit(x);
1758                               x &=~ ((bpf_u_int32)1 << k);
1759                               k += i * BITS_PER_WORD;
1760 
1761                               target = fold_edge(ep->succ, opt_state->edges[k]);
1762                               /*
1763                                * We have a candidate to replace the successor
1764                                * of ep.
1765                                *
1766                                * Check that there is no data dependency between
1767                                * nodes that will be violated if we move the edge;
1768                                * i.e., if any register used on exit from the
1769                                * candidate has a value at that point different
1770                                * from the value it has when we exit the
1771                                * predecessor of that edge, there's a data
1772                                * dependency that will be violated.
1773                                */
1774                               if (target != 0 && !use_conflict(ep->pred, target)) {
1775                                         /*
1776                                          * It's safe to replace the successor of
1777                                          * ep; do so, and note that we've made
1778                                          * at least one change.
1779                                          *
1780                                          * XXX - this is one of the operations that
1781                                          * happens when the optimizer gets into
1782                                          * one of those infinite loops.
1783                                          */
1784                                         opt_state->done = 0;
1785                                         ep->succ = target;
1786                                         if (JT(target) != 0)
1787                                                   /*
1788                                                    * Start over unless we hit a leaf.
1789                                                    */
1790                                                   goto top;
1791                                         return;
1792                               }
1793                     }
1794           }
1795 }
1796 
1797 /*
1798  * XXX - is this, and and_pullup(), what's described in section 6.1.2
1799  * "Predicate Assertion Propagation" in the BPF+ paper?
1800  *
1801  * Note that this looks at block dominators, not edge dominators.
1802  * Don't think so.
1803  *
1804  * "A or B" compiles into
1805  *
1806  *          A
1807  *       t / \ f
1808  *        /   B
1809  *       / t / \ f
1810  *      \   /
1811  *       \ /
1812  *        X
1813  *
1814  *
1815  */
1816 static void
or_pullup(opt_state_t * opt_state,struct block * b)1817 or_pullup(opt_state_t *opt_state, struct block *b)
1818 {
1819           bpf_u_int32 val;
1820           int at_top;
1821           struct block *pull;
1822           struct block **diffp, **samep;
1823           struct edge *ep;
1824 
1825           ep = b->in_edges;
1826           if (ep == 0)
1827                     return;
1828 
1829           /*
1830            * Make sure each predecessor loads the same value.
1831            * XXX why?
1832            */
1833           val = ep->pred->val[A_ATOM];
1834           for (ep = ep->next; ep != 0; ep = ep->next)
1835                     if (val != ep->pred->val[A_ATOM])
1836                               return;
1837 
1838           /*
1839            * For the first edge in the list of edges coming into this block,
1840            * see whether the predecessor of that edge comes here via a true
1841            * branch or a false branch.
1842            */
1843           if (JT(b->in_edges->pred) == b)
1844                     diffp = &JT(b->in_edges->pred);         /* jt */
1845           else
1846                     diffp = &JF(b->in_edges->pred);         /* jf */
1847 
1848           /*
1849            * diffp is a pointer to a pointer to the block.
1850            *
1851            * Go down the false chain looking as far as you can,
1852            * making sure that each jump-compare is doing the
1853            * same as the original block.
1854            *
1855            * If you reach the bottom before you reach a
1856            * different jump-compare, just exit.  There's nothing
1857            * to do here.  XXX - no, this version is checking for
1858            * the value leaving the block; that's from the BPF+
1859            * pullup routine.
1860            */
1861           at_top = 1;
1862           for (;;) {
1863                     /*
1864                      * Done if that's not going anywhere XXX
1865                      */
1866                     if (*diffp == 0)
1867                               return;
1868 
1869                     /*
1870                      * Done if that predecessor blah blah blah isn't
1871                      * going the same place we're going XXX
1872                      *
1873                      * Does the true edge of this block point to the same
1874                      * location as the true edge of b?
1875                      */
1876                     if (JT(*diffp) != JT(b))
1877                               return;
1878 
1879                     /*
1880                      * Done if this node isn't a dominator of that
1881                      * node blah blah blah XXX
1882                      *
1883                      * Does b dominate diffp?
1884                      */
1885                     if (!SET_MEMBER((*diffp)->dom, b->id))
1886                               return;
1887 
1888                     /*
1889                      * Break out of the loop if that node's value of A
1890                      * isn't the value of A above XXX
1891                      */
1892                     if ((*diffp)->val[A_ATOM] != val)
1893                               break;
1894 
1895                     /*
1896                      * Get the JF for that node XXX
1897                      * Go down the false path.
1898                      */
1899                     diffp = &JF(*diffp);
1900                     at_top = 0;
1901           }
1902 
1903           /*
1904            * Now that we've found a different jump-compare in a chain
1905            * below b, search further down until we find another
1906            * jump-compare that looks at the original value.  This
1907            * jump-compare should get pulled up.  XXX again we're
1908            * comparing values not jump-compares.
1909            */
1910           samep = &JF(*diffp);
1911           for (;;) {
1912                     /*
1913                      * Done if that's not going anywhere XXX
1914                      */
1915                     if (*samep == 0)
1916                               return;
1917 
1918                     /*
1919                      * Done if that predecessor blah blah blah isn't
1920                      * going the same place we're going XXX
1921                      */
1922                     if (JT(*samep) != JT(b))
1923                               return;
1924 
1925                     /*
1926                      * Done if this node isn't a dominator of that
1927                      * node blah blah blah XXX
1928                      *
1929                      * Does b dominate samep?
1930                      */
1931                     if (!SET_MEMBER((*samep)->dom, b->id))
1932                               return;
1933 
1934                     /*
1935                      * Break out of the loop if that node's value of A
1936                      * is the value of A above XXX
1937                      */
1938                     if ((*samep)->val[A_ATOM] == val)
1939                               break;
1940 
1941                     /* XXX Need to check that there are no data dependencies
1942                        between dp0 and dp1.  Currently, the code generator
1943                        will not produce such dependencies. */
1944                     samep = &JF(*samep);
1945           }
1946 #ifdef notdef
1947           /* XXX This doesn't cover everything. */
1948           for (i = 0; i < N_ATOMS; ++i)
1949                     if ((*samep)->val[i] != pred->val[i])
1950                               return;
1951 #endif
1952           /* Pull up the node. */
1953           pull = *samep;
1954           *samep = JF(pull);
1955           JF(pull) = *diffp;
1956 
1957           /*
1958            * At the top of the chain, each predecessor needs to point at the
1959            * pulled up node.  Inside the chain, there is only one predecessor
1960            * to worry about.
1961            */
1962           if (at_top) {
1963                     for (ep = b->in_edges; ep != 0; ep = ep->next) {
1964                               if (JT(ep->pred) == b)
1965                                         JT(ep->pred) = pull;
1966                               else
1967                                         JF(ep->pred) = pull;
1968                     }
1969           }
1970           else
1971                     *diffp = pull;
1972 
1973           /*
1974            * XXX - this is one of the operations that happens when the
1975            * optimizer gets into one of those infinite loops.
1976            */
1977           opt_state->done = 0;
1978 }
1979 
1980 static void
and_pullup(opt_state_t * opt_state,struct block * b)1981 and_pullup(opt_state_t *opt_state, struct block *b)
1982 {
1983           bpf_u_int32 val;
1984           int at_top;
1985           struct block *pull;
1986           struct block **diffp, **samep;
1987           struct edge *ep;
1988 
1989           ep = b->in_edges;
1990           if (ep == 0)
1991                     return;
1992 
1993           /*
1994            * Make sure each predecessor loads the same value.
1995            */
1996           val = ep->pred->val[A_ATOM];
1997           for (ep = ep->next; ep != 0; ep = ep->next)
1998                     if (val != ep->pred->val[A_ATOM])
1999                               return;
2000 
2001           if (JT(b->in_edges->pred) == b)
2002                     diffp = &JT(b->in_edges->pred);
2003           else
2004                     diffp = &JF(b->in_edges->pred);
2005 
2006           at_top = 1;
2007           for (;;) {
2008                     if (*diffp == 0)
2009                               return;
2010 
2011                     if (JF(*diffp) != JF(b))
2012                               return;
2013 
2014                     if (!SET_MEMBER((*diffp)->dom, b->id))
2015                               return;
2016 
2017                     if ((*diffp)->val[A_ATOM] != val)
2018                               break;
2019 
2020                     diffp = &JT(*diffp);
2021                     at_top = 0;
2022           }
2023           samep = &JT(*diffp);
2024           for (;;) {
2025                     if (*samep == 0)
2026                               return;
2027 
2028                     if (JF(*samep) != JF(b))
2029                               return;
2030 
2031                     if (!SET_MEMBER((*samep)->dom, b->id))
2032                               return;
2033 
2034                     if ((*samep)->val[A_ATOM] == val)
2035                               break;
2036 
2037                     /* XXX Need to check that there are no data dependencies
2038                        between diffp and samep.  Currently, the code generator
2039                        will not produce such dependencies. */
2040                     samep = &JT(*samep);
2041           }
2042 #ifdef notdef
2043           /* XXX This doesn't cover everything. */
2044           for (i = 0; i < N_ATOMS; ++i)
2045                     if ((*samep)->val[i] != pred->val[i])
2046                               return;
2047 #endif
2048           /* Pull up the node. */
2049           pull = *samep;
2050           *samep = JT(pull);
2051           JT(pull) = *diffp;
2052 
2053           /*
2054            * At the top of the chain, each predecessor needs to point at the
2055            * pulled up node.  Inside the chain, there is only one predecessor
2056            * to worry about.
2057            */
2058           if (at_top) {
2059                     for (ep = b->in_edges; ep != 0; ep = ep->next) {
2060                               if (JT(ep->pred) == b)
2061                                         JT(ep->pred) = pull;
2062                               else
2063                                         JF(ep->pred) = pull;
2064                     }
2065           }
2066           else
2067                     *diffp = pull;
2068 
2069           /*
2070            * XXX - this is one of the operations that happens when the
2071            * optimizer gets into one of those infinite loops.
2072            */
2073           opt_state->done = 0;
2074 }
2075 
2076 static void
opt_blks(opt_state_t * opt_state,struct icode * ic,int do_stmts)2077 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2078 {
2079           int i, maxlevel;
2080           struct block *p;
2081 
2082           init_val(opt_state);
2083           maxlevel = ic->root->level;
2084 
2085           find_inedges(opt_state, ic->root);
2086           for (i = maxlevel; i >= 0; --i)
2087                     for (p = opt_state->levels[i]; p; p = p->link)
2088                               opt_blk(opt_state, p, do_stmts);
2089 
2090           if (do_stmts)
2091                     /*
2092                      * No point trying to move branches; it can't possibly
2093                      * make a difference at this point.
2094                      *
2095                      * XXX - this might be after we detect a loop where
2096                      * we were just looping infinitely moving branches
2097                      * in such a fashion that we went through two or more
2098                      * versions of the machine code, eventually returning
2099                      * to the first version.  (We're really not doing a
2100                      * full loop detection, we're just testing for two
2101                      * passes in a row where where we do nothing but
2102                      * move branches.)
2103                      */
2104                     return;
2105 
2106           /*
2107            * Is this what the BPF+ paper describes in sections 6.1.1,
2108            * 6.1.2, and 6.1.3?
2109            */
2110           for (i = 1; i <= maxlevel; ++i) {
2111                     for (p = opt_state->levels[i]; p; p = p->link) {
2112                               opt_j(opt_state, &p->et);
2113                               opt_j(opt_state, &p->ef);
2114                     }
2115           }
2116 
2117           find_inedges(opt_state, ic->root);
2118           for (i = 1; i <= maxlevel; ++i) {
2119                     for (p = opt_state->levels[i]; p; p = p->link) {
2120                               or_pullup(opt_state, p);
2121                               and_pullup(opt_state, p);
2122                     }
2123           }
2124 }
2125 
2126 static inline void
link_inedge(struct edge * parent,struct block * child)2127 link_inedge(struct edge *parent, struct block *child)
2128 {
2129           parent->next = child->in_edges;
2130           child->in_edges = parent;
2131 }
2132 
2133 static void
find_inedges(opt_state_t * opt_state,struct block * root)2134 find_inedges(opt_state_t *opt_state, struct block *root)
2135 {
2136           u_int i;
2137           int level;
2138           struct block *b;
2139 
2140           for (i = 0; i < opt_state->n_blocks; ++i)
2141                     opt_state->blocks[i]->in_edges = 0;
2142 
2143           /*
2144            * Traverse the graph, adding each edge to the predecessor
2145            * list of its successors.  Skip the leaves (i.e. level 0).
2146            */
2147           for (level = root->level; level > 0; --level) {
2148                     for (b = opt_state->levels[level]; b != 0; b = b->link) {
2149                               link_inedge(&b->et, JT(b));
2150                               link_inedge(&b->ef, JF(b));
2151                     }
2152           }
2153 }
2154 
2155 static void
opt_root(struct block ** b)2156 opt_root(struct block **b)
2157 {
2158           struct slist *tmp, *s;
2159 
2160           s = (*b)->stmts;
2161           (*b)->stmts = 0;
2162           while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
2163                     *b = JT(*b);
2164 
2165           tmp = (*b)->stmts;
2166           if (tmp != 0)
2167                     sappend(s, tmp);
2168           (*b)->stmts = s;
2169 
2170           /*
2171            * If the root node is a return, then there is no
2172            * point executing any statements (since the bpf machine
2173            * has no side effects).
2174            */
2175           if (BPF_CLASS((*b)->s.code) == BPF_RET)
2176                     (*b)->stmts = 0;
2177 }
2178 
2179 static void
opt_loop(opt_state_t * opt_state,struct icode * ic,int do_stmts)2180 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
2181 {
2182 
2183 #ifdef BDEBUG
2184           if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2185                     printf("opt_loop(root, %d) begin\n", do_stmts);
2186                     opt_dump(opt_state, ic);
2187           }
2188 #endif
2189 
2190           /*
2191            * XXX - optimizer loop detection.
2192            */
2193           int loop_count = 0;
2194           for (;;) {
2195                     opt_state->done = 1;
2196                     /*
2197                      * XXX - optimizer loop detection.
2198                      */
2199                     opt_state->non_branch_movement_performed = 0;
2200                     find_levels(opt_state, ic);
2201                     find_dom(opt_state, ic->root);
2202                     find_closure(opt_state, ic->root);
2203                     find_ud(opt_state, ic->root);
2204                     find_edom(opt_state, ic->root);
2205                     opt_blks(opt_state, ic, do_stmts);
2206 #ifdef BDEBUG
2207                     if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2208                               printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
2209                               opt_dump(opt_state, ic);
2210                     }
2211 #endif
2212 
2213                     /*
2214                      * Was anything done in this optimizer pass?
2215                      */
2216                     if (opt_state->done) {
2217                               /*
2218                                * No, so we've reached a fixed point.
2219                                * We're done.
2220                                */
2221                               break;
2222                     }
2223 
2224                     /*
2225                      * XXX - was anything done other than branch movement
2226                      * in this pass?
2227                      */
2228                     if (opt_state->non_branch_movement_performed) {
2229                               /*
2230                                * Yes.  Clear any loop-detection counter;
2231                                * we're making some form of progress (assuming
2232                                * we can't get into a cycle doing *other*
2233                                * optimizations...).
2234                                */
2235                               loop_count = 0;
2236                     } else {
2237                               /*
2238                                * No - increment the counter, and quit if
2239                                * it's up to 100.
2240                                */
2241                               loop_count++;
2242                               if (loop_count >= 100) {
2243                                         /*
2244                                          * We've done nothing but branch movement
2245                                          * for 100 passes; we're probably
2246                                          * in a cycle and will never reach a
2247                                          * fixed point.
2248                                          *
2249                                          * XXX - yes, we really need a non-
2250                                          * heuristic way of detecting a cycle.
2251                                          */
2252                                         opt_state->done = 1;
2253                                         break;
2254                               }
2255                     }
2256           }
2257 }
2258 
2259 /*
2260  * Optimize the filter code in its dag representation.
2261  * Return 0 on success, -1 on error.
2262  */
2263 int
bpf_optimize(struct icode * ic,char * errbuf)2264 bpf_optimize(struct icode *ic, char *errbuf)
2265 {
2266           opt_state_t opt_state;
2267 
2268           memset(&opt_state, 0, sizeof(opt_state));
2269           opt_state.errbuf = errbuf;
2270           opt_state.non_branch_movement_performed = 0;
2271           if (setjmp(opt_state.top_ctx)) {
2272                     opt_cleanup(&opt_state);
2273                     return -1;
2274           }
2275           opt_init(&opt_state, ic);
2276           opt_loop(&opt_state, ic, 0);
2277           opt_loop(&opt_state, ic, 1);
2278           intern_blocks(&opt_state, ic);
2279 #ifdef BDEBUG
2280           if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2281                     printf("after intern_blocks()\n");
2282                     opt_dump(&opt_state, ic);
2283           }
2284 #endif
2285           opt_root(&ic->root);
2286 #ifdef BDEBUG
2287           if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
2288                     printf("after opt_root()\n");
2289                     opt_dump(&opt_state, ic);
2290           }
2291 #endif
2292           opt_cleanup(&opt_state);
2293           return 0;
2294 }
2295 
2296 static void
make_marks(struct icode * ic,struct block * p)2297 make_marks(struct icode *ic, struct block *p)
2298 {
2299           if (!isMarked(ic, p)) {
2300                     Mark(ic, p);
2301                     if (BPF_CLASS(p->s.code) != BPF_RET) {
2302                               make_marks(ic, JT(p));
2303                               make_marks(ic, JF(p));
2304                     }
2305           }
2306 }
2307 
2308 /*
2309  * Mark code array such that isMarked(ic->cur_mark, i) is true
2310  * only for nodes that are alive.
2311  */
2312 static void
mark_code(struct icode * ic)2313 mark_code(struct icode *ic)
2314 {
2315           ic->cur_mark += 1;
2316           make_marks(ic, ic->root);
2317 }
2318 
2319 /*
2320  * True iff the two stmt lists load the same value from the packet into
2321  * the accumulator.
2322  */
2323 static int
eq_slist(struct slist * x,struct slist * y)2324 eq_slist(struct slist *x, struct slist *y)
2325 {
2326           for (;;) {
2327                     while (x && x->s.code == NOP)
2328                               x = x->next;
2329                     while (y && y->s.code == NOP)
2330                               y = y->next;
2331                     if (x == 0)
2332                               return y == 0;
2333                     if (y == 0)
2334                               return x == 0;
2335                     if (x->s.code != y->s.code || x->s.k != y->s.k)
2336                               return 0;
2337                     x = x->next;
2338                     y = y->next;
2339           }
2340 }
2341 
2342 static inline int
eq_blk(struct block * b0,struct block * b1)2343 eq_blk(struct block *b0, struct block *b1)
2344 {
2345           if (b0->s.code == b1->s.code &&
2346               b0->s.k == b1->s.k &&
2347               b0->et.succ == b1->et.succ &&
2348               b0->ef.succ == b1->ef.succ)
2349                     return eq_slist(b0->stmts, b1->stmts);
2350           return 0;
2351 }
2352 
2353 static void
intern_blocks(opt_state_t * opt_state,struct icode * ic)2354 intern_blocks(opt_state_t *opt_state, struct icode *ic)
2355 {
2356           struct block *p;
2357           u_int i, j;
2358           int done1; /* don't shadow global */
2359  top:
2360           done1 = 1;
2361           for (i = 0; i < opt_state->n_blocks; ++i)
2362                     opt_state->blocks[i]->link = 0;
2363 
2364           mark_code(ic);
2365 
2366           for (i = opt_state->n_blocks - 1; i != 0; ) {
2367                     --i;
2368                     if (!isMarked(ic, opt_state->blocks[i]))
2369                               continue;
2370                     for (j = i + 1; j < opt_state->n_blocks; ++j) {
2371                               if (!isMarked(ic, opt_state->blocks[j]))
2372                                         continue;
2373                               if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2374                                         opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2375                                                   opt_state->blocks[j]->link : opt_state->blocks[j];
2376                                         break;
2377                               }
2378                     }
2379           }
2380           for (i = 0; i < opt_state->n_blocks; ++i) {
2381                     p = opt_state->blocks[i];
2382                     if (JT(p) == 0)
2383                               continue;
2384                     if (JT(p)->link) {
2385                               done1 = 0;
2386                               JT(p) = JT(p)->link;
2387                     }
2388                     if (JF(p)->link) {
2389                               done1 = 0;
2390                               JF(p) = JF(p)->link;
2391                     }
2392           }
2393           if (!done1)
2394                     goto top;
2395 }
2396 
2397 static void
opt_cleanup(opt_state_t * opt_state)2398 opt_cleanup(opt_state_t *opt_state)
2399 {
2400           free((void *)opt_state->vnode_base);
2401           free((void *)opt_state->vmap);
2402           free((void *)opt_state->edges);
2403           free((void *)opt_state->space);
2404           free((void *)opt_state->levels);
2405           free((void *)opt_state->blocks);
2406 }
2407 
2408 /*
2409  * For optimizer errors.
2410  */
2411 static void PCAP_NORETURN
opt_error(opt_state_t * opt_state,const char * fmt,...)2412 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2413 {
2414           va_list ap;
2415 
2416           if (opt_state->errbuf != NULL) {
2417                     va_start(ap, fmt);
2418                     (void)vsnprintf(opt_state->errbuf,
2419                         PCAP_ERRBUF_SIZE, fmt, ap);
2420                     va_end(ap);
2421           }
2422           longjmp(opt_state->top_ctx, 1);
2423           /* NOTREACHED */
2424 }
2425 
2426 /*
2427  * Return the number of stmts in 's'.
2428  */
2429 static u_int
slength(struct slist * s)2430 slength(struct slist *s)
2431 {
2432           u_int n = 0;
2433 
2434           for (; s; s = s->next)
2435                     if (s->s.code != NOP)
2436                               ++n;
2437           return n;
2438 }
2439 
2440 /*
2441  * Return the number of nodes reachable by 'p'.
2442  * All nodes should be initially unmarked.
2443  */
2444 static int
count_blocks(struct icode * ic,struct block * p)2445 count_blocks(struct icode *ic, struct block *p)
2446 {
2447           if (p == 0 || isMarked(ic, p))
2448                     return 0;
2449           Mark(ic, p);
2450           return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2451 }
2452 
2453 /*
2454  * Do a depth first search on the flow graph, numbering the
2455  * the basic blocks, and entering them into the 'blocks' array.`
2456  */
2457 static void
number_blks_r(opt_state_t * opt_state,struct icode * ic,struct block * p)2458 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2459 {
2460           u_int n;
2461 
2462           if (p == 0 || isMarked(ic, p))
2463                     return;
2464 
2465           Mark(ic, p);
2466           n = opt_state->n_blocks++;
2467           if (opt_state->n_blocks == 0) {
2468                     /*
2469                      * Overflow.
2470                      */
2471                     opt_error(opt_state, "filter is too complex to optimize");
2472           }
2473           p->id = n;
2474           opt_state->blocks[n] = p;
2475 
2476           number_blks_r(opt_state, ic, JT(p));
2477           number_blks_r(opt_state, ic, JF(p));
2478 }
2479 
2480 /*
2481  * Return the number of stmts in the flowgraph reachable by 'p'.
2482  * The nodes should be unmarked before calling.
2483  *
2484  * Note that "stmts" means "instructions", and that this includes
2485  *
2486  *        side-effect statements in 'p' (slength(p->stmts));
2487  *
2488  *        statements in the true branch from 'p' (count_stmts(JT(p)));
2489  *
2490  *        statements in the false branch from 'p' (count_stmts(JF(p)));
2491  *
2492  *        the conditional jump itself (1);
2493  *
2494  *        an extra long jump if the true branch requires it (p->longjt);
2495  *
2496  *        an extra long jump if the false branch requires it (p->longjf).
2497  */
2498 static u_int
count_stmts(struct icode * ic,struct block * p)2499 count_stmts(struct icode *ic, struct block *p)
2500 {
2501           u_int n;
2502 
2503           if (p == 0 || isMarked(ic, p))
2504                     return 0;
2505           Mark(ic, p);
2506           n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2507           return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2508 }
2509 
2510 /*
2511  * Allocate memory.  All allocation is done before optimization
2512  * is begun.  A linear bound on the size of all data structures is computed
2513  * from the total number of blocks and/or statements.
2514  */
2515 static void
opt_init(opt_state_t * opt_state,struct icode * ic)2516 opt_init(opt_state_t *opt_state, struct icode *ic)
2517 {
2518           bpf_u_int32 *p;
2519           int i, n, max_stmts;
2520           u_int product;
2521           size_t block_memsize, edge_memsize;
2522 
2523           /*
2524            * First, count the blocks, so we can malloc an array to map
2525            * block number to block.  Then, put the blocks into the array.
2526            */
2527           unMarkAll(ic);
2528           n = count_blocks(ic, ic->root);
2529           opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2530           if (opt_state->blocks == NULL)
2531                     opt_error(opt_state, "malloc");
2532           unMarkAll(ic);
2533           opt_state->n_blocks = 0;
2534           number_blks_r(opt_state, ic, ic->root);
2535 
2536           /*
2537            * This "should not happen".
2538            */
2539           if (opt_state->n_blocks == 0)
2540                     opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");
2541 
2542           opt_state->n_edges = 2 * opt_state->n_blocks;
2543           if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
2544                     /*
2545                      * Overflow.
2546                      */
2547                     opt_error(opt_state, "filter is too complex to optimize");
2548           }
2549           opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2550           if (opt_state->edges == NULL) {
2551                     opt_error(opt_state, "malloc");
2552           }
2553 
2554           /*
2555            * The number of levels is bounded by the number of nodes.
2556            */
2557           opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2558           if (opt_state->levels == NULL) {
2559                     opt_error(opt_state, "malloc");
2560           }
2561 
2562           opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
2563           opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
2564 
2565           /*
2566            * Make sure opt_state->n_blocks * opt_state->nodewords fits
2567            * in a u_int; we use it as a u_int number-of-iterations
2568            * value.
2569            */
2570           product = opt_state->n_blocks * opt_state->nodewords;
2571           if ((product / opt_state->n_blocks) != opt_state->nodewords) {
2572                     /*
2573                      * XXX - just punt and don't try to optimize?
2574                      * In practice, this is unlikely to happen with
2575                      * a normal filter.
2576                      */
2577                     opt_error(opt_state, "filter is too complex to optimize");
2578           }
2579 
2580           /*
2581            * Make sure the total memory required for that doesn't
2582            * overflow.
2583            */
2584           block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
2585           if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
2586                     opt_error(opt_state, "filter is too complex to optimize");
2587           }
2588 
2589           /*
2590            * Make sure opt_state->n_edges * opt_state->edgewords fits
2591            * in a u_int; we use it as a u_int number-of-iterations
2592            * value.
2593            */
2594           product = opt_state->n_edges * opt_state->edgewords;
2595           if ((product / opt_state->n_edges) != opt_state->edgewords) {
2596                     opt_error(opt_state, "filter is too complex to optimize");
2597           }
2598 
2599           /*
2600            * Make sure the total memory required for that doesn't
2601            * overflow.
2602            */
2603           edge_memsize = (size_t)product * sizeof(*opt_state->space);
2604           if (edge_memsize / product != sizeof(*opt_state->space)) {
2605                     opt_error(opt_state, "filter is too complex to optimize");
2606           }
2607 
2608           /*
2609            * Make sure the total memory required for both of them dosn't
2610            * overflow.
2611            */
2612           if (block_memsize > SIZE_MAX - edge_memsize) {
2613                     opt_error(opt_state, "filter is too complex to optimize");
2614           }
2615 
2616           /* XXX */
2617           opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
2618           if (opt_state->space == NULL) {
2619                     opt_error(opt_state, "malloc");
2620           }
2621           p = opt_state->space;
2622           opt_state->all_dom_sets = p;
2623           for (i = 0; i < n; ++i) {
2624                     opt_state->blocks[i]->dom = p;
2625                     p += opt_state->nodewords;
2626           }
2627           opt_state->all_closure_sets = p;
2628           for (i = 0; i < n; ++i) {
2629                     opt_state->blocks[i]->closure = p;
2630                     p += opt_state->nodewords;
2631           }
2632           opt_state->all_edge_sets = p;
2633           for (i = 0; i < n; ++i) {
2634                     register struct block *b = opt_state->blocks[i];
2635 
2636                     b->et.edom = p;
2637                     p += opt_state->edgewords;
2638                     b->ef.edom = p;
2639                     p += opt_state->edgewords;
2640                     b->et.id = i;
2641                     opt_state->edges[i] = &b->et;
2642                     b->ef.id = opt_state->n_blocks + i;
2643                     opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2644                     b->et.pred = b;
2645                     b->ef.pred = b;
2646           }
2647           max_stmts = 0;
2648           for (i = 0; i < n; ++i)
2649                     max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2650           /*
2651            * We allocate at most 3 value numbers per statement,
2652            * so this is an upper bound on the number of valnodes
2653            * we'll need.
2654            */
2655           opt_state->maxval = 3 * max_stmts;
2656           opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2657           if (opt_state->vmap == NULL) {
2658                     opt_error(opt_state, "malloc");
2659           }
2660           opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2661           if (opt_state->vnode_base == NULL) {
2662                     opt_error(opt_state, "malloc");
2663           }
2664 }
2665 
2666 /*
2667  * This is only used when supporting optimizer debugging.  It is
2668  * global state, so do *not* do more than one compile in parallel
2669  * and expect it to provide meaningful information.
2670  */
2671 #ifdef BDEBUG
2672 int bids[NBIDS];
2673 #endif
2674 
2675 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2676     PCAP_PRINTFLIKE(2, 3);
2677 
2678 /*
2679  * Returns true if successful.  Returns false if a branch has
2680  * an offset that is too large.  If so, we have marked that
2681  * branch so that on a subsequent iteration, it will be treated
2682  * properly.
2683  */
2684 static int
convert_code_r(conv_state_t * conv_state,struct icode * ic,struct block * p)2685 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2686 {
2687           struct bpf_insn *dst;
2688           struct slist *src;
2689           u_int slen;
2690           u_int off;
2691           struct slist **offset = NULL;
2692 
2693           if (p == 0 || isMarked(ic, p))
2694                     return (1);
2695           Mark(ic, p);
2696 
2697           if (convert_code_r(conv_state, ic, JF(p)) == 0)
2698                     return (0);
2699           if (convert_code_r(conv_state, ic, JT(p)) == 0)
2700                     return (0);
2701 
2702           slen = slength(p->stmts);
2703           dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2704                     /* inflate length by any extra jumps */
2705 
2706           p->offset = (int)(dst - conv_state->fstart);
2707 
2708           /* generate offset[] for convenience  */
2709           if (slen) {
2710                     offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2711                     if (!offset) {
2712                               conv_error(conv_state, "not enough core");
2713                               /*NOTREACHED*/
2714                     }
2715           }
2716           src = p->stmts;
2717           for (off = 0; off < slen && src; off++) {
2718 #if 0
2719                     printf("off=%d src=%x\n", off, src);
2720 #endif
2721                     offset[off] = src;
2722                     src = src->next;
2723           }
2724 
2725           off = 0;
2726           for (src = p->stmts; src; src = src->next) {
2727                     if (src->s.code == NOP)
2728                               continue;
2729                     dst->code = (u_short)src->s.code;
2730                     dst->k = src->s.k;
2731 
2732                     /* fill block-local relative jump */
2733                     if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2734 #if 0
2735                               if (src->s.jt || src->s.jf) {
2736                                         free(offset);
2737                                         conv_error(conv_state, "illegal jmp destination");
2738                                         /*NOTREACHED*/
2739                               }
2740 #endif
2741                               goto filled;
2742                     }
2743                     if (off == slen - 2)          /*???*/
2744                               goto filled;
2745 
2746               {
2747                     u_int i;
2748                     int jt, jf;
2749                     const char ljerr[] = "%s for block-local relative jump: off=%d";
2750 
2751 #if 0
2752                     printf("code=%x off=%d %x %x\n", src->s.code,
2753                               off, src->s.jt, src->s.jf);
2754 #endif
2755 
2756                     if (!src->s.jt || !src->s.jf) {
2757                               free(offset);
2758                               conv_error(conv_state, ljerr, "no jmp destination", off);
2759                               /*NOTREACHED*/
2760                     }
2761 
2762                     jt = jf = 0;
2763                     for (i = 0; i < slen; i++) {
2764                               if (offset[i] == src->s.jt) {
2765                                         if (jt) {
2766                                                   free(offset);
2767                                                   conv_error(conv_state, ljerr, "multiple matches", off);
2768                                                   /*NOTREACHED*/
2769                                         }
2770 
2771                                         if (i - off - 1 >= 256) {
2772                                                   free(offset);
2773                                                   conv_error(conv_state, ljerr, "out-of-range jump", off);
2774                                                   /*NOTREACHED*/
2775                                         }
2776                                         dst->jt = (u_char)(i - off - 1);
2777                                         jt++;
2778                               }
2779                               if (offset[i] == src->s.jf) {
2780                                         if (jf) {
2781                                                   free(offset);
2782                                                   conv_error(conv_state, ljerr, "multiple matches", off);
2783                                                   /*NOTREACHED*/
2784                                         }
2785                                         if (i - off - 1 >= 256) {
2786                                                   free(offset);
2787                                                   conv_error(conv_state, ljerr, "out-of-range jump", off);
2788                                                   /*NOTREACHED*/
2789                                         }
2790                                         dst->jf = (u_char)(i - off - 1);
2791                                         jf++;
2792                               }
2793                     }
2794                     if (!jt || !jf) {
2795                               free(offset);
2796                               conv_error(conv_state, ljerr, "no destination found", off);
2797                               /*NOTREACHED*/
2798                     }
2799               }
2800 filled:
2801                     ++dst;
2802                     ++off;
2803           }
2804           if (offset)
2805                     free(offset);
2806 
2807 #ifdef BDEBUG
2808           if (dst - conv_state->fstart < NBIDS)
2809                     bids[dst - conv_state->fstart] = p->id + 1;
2810 #endif
2811           dst->code = (u_short)p->s.code;
2812           dst->k = p->s.k;
2813           if (JT(p)) {
2814                     /* number of extra jumps inserted */
2815                     u_char extrajmps = 0;
2816                     off = JT(p)->offset - (p->offset + slen) - 1;
2817                     if (off >= 256) {
2818                         /* offset too large for branch, must add a jump */
2819                         if (p->longjt == 0) {
2820                               /* mark this instruction and retry */
2821                               p->longjt++;
2822                               return(0);
2823                         }
2824                         dst->jt = extrajmps;
2825                         extrajmps++;
2826                         dst[extrajmps].code = BPF_JMP|BPF_JA;
2827                         dst[extrajmps].k = off - extrajmps;
2828                     }
2829                     else
2830                         dst->jt = (u_char)off;
2831                     off = JF(p)->offset - (p->offset + slen) - 1;
2832                     if (off >= 256) {
2833                         /* offset too large for branch, must add a jump */
2834                         if (p->longjf == 0) {
2835                               /* mark this instruction and retry */
2836                               p->longjf++;
2837                               return(0);
2838                         }
2839                         /* branch if F to following jump */
2840                         /* if two jumps are inserted, F goes to second one */
2841                         dst->jf = extrajmps;
2842                         extrajmps++;
2843                         dst[extrajmps].code = BPF_JMP|BPF_JA;
2844                         dst[extrajmps].k = off - extrajmps;
2845                     }
2846                     else
2847                         dst->jf = (u_char)off;
2848           }
2849           return (1);
2850 }
2851 
2852 
2853 /*
2854  * Convert flowgraph intermediate representation to the
2855  * BPF array representation.  Set *lenp to the number of instructions.
2856  *
2857  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2858  * not* do free(fp) before returning fp; doing so would make no sense,
2859  * as the BPF array pointed to by the return value of icode_to_fcode()
2860  * must be valid - it's being returned for use in a bpf_program structure.
2861  *
2862  * If it appears that icode_to_fcode() is leaking, the problem is that
2863  * the program using pcap_compile() is failing to free the memory in
2864  * the BPF program when it's done - the leak is in the program, not in
2865  * the routine that happens to be allocating the memory.  (By analogy, if
2866  * a program calls fopen() without ever calling fclose() on the FILE *,
2867  * it will leak the FILE structure; the leak is not in fopen(), it's in
2868  * the program.)  Change the program to use pcap_freecode() when it's
2869  * done with the filter program.  See the pcap man page.
2870  */
2871 struct bpf_insn *
icode_to_fcode(struct icode * ic,struct block * root,u_int * lenp,char * errbuf)2872 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2873     char *errbuf)
2874 {
2875           u_int n;
2876           struct bpf_insn *fp;
2877           conv_state_t conv_state;
2878 
2879           conv_state.fstart = NULL;
2880           conv_state.errbuf = errbuf;
2881           if (setjmp(conv_state.top_ctx) != 0) {
2882                     free(conv_state.fstart);
2883                     return NULL;
2884           }
2885 
2886           /*
2887            * Loop doing convert_code_r() until no branches remain
2888            * with too-large offsets.
2889            */
2890           for (;;) {
2891               unMarkAll(ic);
2892               n = *lenp = count_stmts(ic, root);
2893 
2894               fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2895               if (fp == NULL) {
2896                     (void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
2897                         "malloc");
2898                     free(fp);
2899                     return NULL;
2900               }
2901               memset((char *)fp, 0, sizeof(*fp) * n);
2902               conv_state.fstart = fp;
2903               conv_state.ftail = fp + n;
2904 
2905               unMarkAll(ic);
2906               if (convert_code_r(&conv_state, ic, root))
2907                     break;
2908               free(fp);
2909           }
2910 
2911           return fp;
2912 }
2913 
2914 /*
2915  * For iconv_to_fconv() errors.
2916  */
2917 static void PCAP_NORETURN
conv_error(conv_state_t * conv_state,const char * fmt,...)2918 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2919 {
2920           va_list ap;
2921 
2922           va_start(ap, fmt);
2923           (void)vsnprintf(conv_state->errbuf,
2924               PCAP_ERRBUF_SIZE, fmt, ap);
2925           va_end(ap);
2926           longjmp(conv_state->top_ctx, 1);
2927           /* NOTREACHED */
2928 }
2929 
2930 /*
2931  * Make a copy of a BPF program and put it in the "fcode" member of
2932  * a "pcap_t".
2933  *
2934  * If we fail to allocate memory for the copy, fill in the "errbuf"
2935  * member of the "pcap_t" with an error message, and return -1;
2936  * otherwise, return 0.
2937  */
2938 int
install_bpf_program(pcap_t * p,struct bpf_program * fp)2939 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2940 {
2941           size_t prog_size;
2942 
2943           /*
2944            * Validate the program.
2945            */
2946           if (!pcap_validate_filter(fp->bf_insns, fp->bf_len)) {
2947                     snprintf(p->errbuf, sizeof(p->errbuf),
2948                               "BPF program is not valid");
2949                     return (-1);
2950           }
2951 
2952           /*
2953            * Free up any already installed program.
2954            */
2955           pcap_freecode(&p->fcode);
2956 
2957           prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2958           p->fcode.bf_len = fp->bf_len;
2959           p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2960           if (p->fcode.bf_insns == NULL) {
2961                     pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2962                         errno, "malloc");
2963                     return (-1);
2964           }
2965           memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2966           return (0);
2967 }
2968 
2969 #ifdef BDEBUG
2970 static void
dot_dump_node(struct icode * ic,struct block * block,struct bpf_program * prog,FILE * out)2971 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2972     FILE *out)
2973 {
2974           int icount, noffset;
2975           int i;
2976 
2977           if (block == NULL || isMarked(ic, block))
2978                     return;
2979           Mark(ic, block);
2980 
2981           icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2982           noffset = min(block->offset + icount, (int)prog->bf_len);
2983 
2984           fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
2985           for (i = block->offset; i < noffset; i++) {
2986                     fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2987           }
2988           fprintf(out, "\" tooltip=\"");
2989           for (i = 0; i < BPF_MEMWORDS; i++)
2990                     if (block->val[i] != VAL_UNKNOWN)
2991                               fprintf(out, "val[%d]=%d ", i, block->val[i]);
2992           fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2993           fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2994           fprintf(out, "\"");
2995           if (JT(block) == NULL)
2996                     fprintf(out, ", peripheries=2");
2997           fprintf(out, "];\n");
2998 
2999           dot_dump_node(ic, JT(block), prog, out);
3000           dot_dump_node(ic, JF(block), prog, out);
3001 }
3002 
3003 static void
dot_dump_edge(struct icode * ic,struct block * block,FILE * out)3004 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
3005 {
3006           if (block == NULL || isMarked(ic, block))
3007                     return;
3008           Mark(ic, block);
3009 
3010           if (JT(block)) {
3011                     fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
3012                                         block->id, JT(block)->id);
3013                     fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
3014                                  block->id, JF(block)->id);
3015           }
3016           dot_dump_edge(ic, JT(block), out);
3017           dot_dump_edge(ic, JF(block), out);
3018 }
3019 
3020 /* Output the block CFG using graphviz/DOT language
3021  * In the CFG, block's code, value index for each registers at EXIT,
3022  * and the jump relationship is show.
3023  *
3024  * example DOT for BPF `ip src host 1.1.1.1' is:
3025     digraph BPF {
3026           block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2          jf 5" tooltip="val[A]=0 val[X]=0"];
3027           block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4          jf 5" tooltip="val[A]=0 val[X]=0"];
3028           block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3029           block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3030           "block0":se -> "block1":n [label="T"];
3031           "block0":sw -> "block3":n [label="F"];
3032           "block1":se -> "block2":n [label="T"];
3033           "block1":sw -> "block3":n [label="F"];
3034     }
3035  *
3036  *  After install graphviz on https://www.graphviz.org/, save it as bpf.dot
3037  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
3038  */
3039 static int
dot_dump(struct icode * ic,char * errbuf)3040 dot_dump(struct icode *ic, char *errbuf)
3041 {
3042           struct bpf_program f;
3043           FILE *out = stdout;
3044 
3045           memset(bids, 0, sizeof bids);
3046           f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3047           if (f.bf_insns == NULL)
3048                     return -1;
3049 
3050           fprintf(out, "digraph BPF {\n");
3051           unMarkAll(ic);
3052           dot_dump_node(ic, ic->root, &f, out);
3053           unMarkAll(ic);
3054           dot_dump_edge(ic, ic->root, out);
3055           fprintf(out, "}\n");
3056 
3057           free((char *)f.bf_insns);
3058           return 0;
3059 }
3060 
3061 static int
plain_dump(struct icode * ic,char * errbuf)3062 plain_dump(struct icode *ic, char *errbuf)
3063 {
3064           struct bpf_program f;
3065 
3066           memset(bids, 0, sizeof bids);
3067           f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
3068           if (f.bf_insns == NULL)
3069                     return -1;
3070           bpf_dump(&f, 1);
3071           putchar('\n');
3072           free((char *)f.bf_insns);
3073           return 0;
3074 }
3075 
3076 static void
opt_dump(opt_state_t * opt_state,struct icode * ic)3077 opt_dump(opt_state_t *opt_state, struct icode *ic)
3078 {
3079           int status;
3080           char errbuf[PCAP_ERRBUF_SIZE];
3081 
3082           /*
3083            * If the CFG, in DOT format, is requested, output it rather than
3084            * the code that would be generated from that graph.
3085            */
3086           if (pcap_print_dot_graph)
3087                     status = dot_dump(ic, errbuf);
3088           else
3089                     status = plain_dump(ic, errbuf);
3090           if (status == -1)
3091                     opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
3092 }
3093 #endif
3094