1 /*
2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Some portions Copyright (C) 2010-2013 Sourcefire, Inc.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that: (1) source code distributions
9  * retain the above copyright notice and this paragraph in its entirety, (2)
10  * distributions including binary code include the above copyright notice and
11  * this paragraph in its entirety in the documentation or other materials
12  * provided with the distribution, and (3) all advertising materials mentioning
13  * features or use of this software display the following acknowledgement:
14  * ``This product includes software developed by the University of California,
15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
16  * the University nor the names of its contributors may be used to endorse
17  * or promote products derived from this software without specific prior
18  * written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  *
23  *  Optimization module for tcpdump intermediate representation.
24  */
25 #ifndef lint
26 static const char __attribute__ ((unused)) rcsid[] =
27     "@(#) $Header: /usr/cvsroot/sfeng/ims/src/libraries/daq/daq/sfbpf/sf_optimize.c,v 1.3 2013/06/28 14:57:30 rcombs Exp $ (LBL)";
28 #endif
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #ifdef WIN32
35 #include "win32-stdinc.h"
36 #else /* WIN32 */
37 #if HAVE_INTTYPES_H
38 #include <inttypes.h>
39 #elif HAVE_STDINT_H
40 #include <stdint.h>
41 #endif
42 #ifdef HAVE_SYS_BITYPES_H
43 #include <sys/bitypes.h>
44 #endif
45 #include <sys/types.h>
46 #endif /* WIN32 */
47 
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <memory.h>
51 #include <string.h>
52 
53 #include <errno.h>
54 
55 #include "sfbpf-int.h"
56 
57 #include "gencode.h"
58 
59 #ifdef BDEBUG
60 extern int dflag;
61 #endif
62 
63 #if defined(MSDOS) && !defined(__DJGPP__)
64 extern int _w32_ffs(int mask);
65 #define ffs _w32_ffs
66 #endif
67 
68 #if defined(WIN32) && defined (_MSC_VER)
69 int ffs(int mask);
70 #endif
71 
72 /*
73  * Represents a deleted instruction.
74  */
75 #define NOP -1
76 
77 /*
78  * Register numbers for use-def values.
79  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
80  * location.  A_ATOM is the accumulator and X_ATOM is the index
81  * register.
82  */
83 #define A_ATOM BPF_MEMWORDS
84 #define X_ATOM (BPF_MEMWORDS+1)
85 
86 /*
87  * This define is used to represent *both* the accumulator and
88  * x register in use-def computations.
89  * Currently, the use-def code assumes only one definition per instruction.
90  */
91 #define AX_ATOM N_ATOMS
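
/*
 * For illustration (assuming the classic BPF_MEMWORDS of 16): atoms
 * 0 through 15 name the scratch slots M[0]..M[15], atom 16 (A_ATOM)
 * is the accumulator, atom 17 (X_ATOM) is the index register, and
 * atom 18 (AX_ATOM == N_ATOMS) is the pseudo-atom standing for both
 * A and X at once.
 */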
92 
93 /*
94  * A flag to indicate that further optimization is needed.
95  * Iterative passes are continued until a given pass yields no
96  * branch movement.
97  */
98 static __thread int done;
99 
100 /*
 * A block is marked if and only if its mark equals the current mark.
102  * Rather than traverse the code array, marking each item, 'cur_mark' is
103  * incremented.  This automatically makes each element unmarked.
104  */
105 static __thread int cur_mark;
106 #define isMarked(p) ((p)->mark == cur_mark)
107 #define unMarkAll() cur_mark += 1
108 #define Mark(p) ((p)->mark = cur_mark)
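
/*
 * Example of the counter trick: if cur_mark is 7, only blocks whose
 * 'mark' field equals 7 are considered marked.  unMarkAll() bumps
 * cur_mark to 8, which instantly invalidates every existing mark
 * without touching any block; the next Mark(p) stamps p->mark with 8.
 */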
109 
110 static void opt_init(struct block *);
111 static void opt_cleanup(void);
112 
113 static void make_marks(struct block *);
114 static void mark_code(struct block *);
115 
116 static void intern_blocks(struct block *);
117 
118 static int eq_slist(struct slist *, struct slist *);
119 
120 static void find_levels_r(struct block *);
121 
122 static void find_levels(struct block *);
123 static void find_dom(struct block *);
124 static void propedom(struct edge *);
125 static void find_edom(struct block *);
126 static void find_closure(struct block *);
127 static int atomuse(struct stmt *);
128 static int atomdef(struct stmt *);
129 static void compute_local_ud(struct block *);
130 static void find_ud(struct block *);
131 static void init_val(void);
132 static int F(int, int, int);
133 static inline void vstore(struct stmt *, int *, int, int);
134 static void opt_blk(struct block *, int);
135 static int use_conflict(struct block *, struct block *);
136 static void opt_j(struct edge *);
137 static void or_pullup(struct block *);
138 static void and_pullup(struct block *);
139 static void opt_blks(struct block *, int);
140 static inline void link_inedge(struct edge *, struct block *);
141 static void find_inedges(struct block *);
142 static void opt_root(struct block **);
143 static void opt_loop(struct block *, int);
144 static void fold_op(struct stmt *, int, int);
145 static inline struct slist *this_op(struct slist *);
146 static void opt_not(struct block *);
147 static void opt_peep(struct block *);
148 static void opt_stmt(struct stmt *, int[], int);
149 static void deadstmt(struct stmt *, struct stmt *[]);
150 static void opt_deadstores(struct block *);
151 static struct block *fold_edge(struct block *, struct edge *);
152 static inline int eq_blk(struct block *, struct block *);
153 static int slength(struct slist *);
154 static int count_blocks(struct block *);
155 static void number_blks_r(struct block *);
156 static int count_stmts(struct block *);
157 static int convert_code_r(struct block *);
158 #ifdef BDEBUG
159 static void opt_dump(struct block *);
160 #endif
161 
162 static __thread int n_blocks;
163 __thread struct block **blocks;
164 static __thread int n_edges;
165 __thread struct edge **edges;
166 
167 /*
168  * A bit vector set representation of the dominators.
169  * We round up the set size to the next power of two.
170  */
171 static __thread int nodewords;
172 static __thread int edgewords;
173 __thread struct block **levels;
174 __thread bpf_u_int32 *space;
175 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
176 /*
 * True if 'a' is in uset 'p'.
178  */
179 #define SET_MEMBER(p, a) \
180 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
181 
182 /*
183  * Add 'a' to uset p.
184  */
185 #define SET_INSERT(p, a) \
186 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
187 
188 /*
189  * Delete 'a' from uset p.
190  */
191 #define SET_DELETE(p, a) \
192 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
193 
194 /*
195  * a := a intersect b
196  */
197 #define SET_INTERSECT(a, b, n)\
198 {\
199 	register bpf_u_int32 *_x = a, *_y = b;\
200 	register int _n = n;\
201 	while (--_n >= 0) *_x++ &= *_y++;\
202 }
203 
204 /*
205  * a := a - b
206  */
207 #define SET_SUBTRACT(a, b, n)\
208 {\
209 	register bpf_u_int32 *_x = a, *_y = b;\
210 	register int _n = n;\
211 	while (--_n >= 0) *_x++ &=~ *_y++;\
212 }
213 
214 /*
215  * a := a union b
216  */
217 #define SET_UNION(a, b, n)\
218 {\
219 	register bpf_u_int32 *_x = a, *_y = b;\
220 	register int _n = n;\
221 	while (--_n >= 0) *_x++ |= *_y++;\
222 }
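
/*
 * Worked example of the layout: with 32-bit words, block id 37 lives
 * in word 37/32 == 1, bit 37%32 == 5, so SET_INSERT(p, 37) sets bit 5
 * of p[1].  The sketch below is illustrative only (never compiled);
 * dominates() is a hypothetical helper, not part of this file.
 */
#if 0
static int dominates(struct block *a, struct block *b)
{
    /* 'a' dominates 'b' iff a's id is a member of b's dominator set. */
    return SET_MEMBER(b->dom, a->id) != 0;
}
#endif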
223 
224 static __thread uset all_dom_sets;
225 static __thread uset all_closure_sets;
226 static __thread uset all_edge_sets;
227 
228 #ifndef MAX
229 #define MAX(a,b) ((a)>(b)?(a):(b))
230 #endif
231 
232 static void find_levels_r(b)
233      struct block *b;
234 {
235     int level;
236 
237     if (isMarked(b))
238         return;
239 
240     Mark(b);
241     b->link = 0;
242 
243     if (JT(b))
244     {
245         find_levels_r(JT(b));
246         find_levels_r(JF(b));
247         level = MAX(JT(b)->level, JF(b)->level) + 1;
248     }
249     else
250         level = 0;
251     b->level = level;
252     b->link = levels[level];
253     levels[level] = b;
254 }
255 
256 /*
257  * Level graph.  The levels go from 0 at the leaves to
258  * N_LEVELS at the root.  The levels[] array points to the
259  * first node of the level list, whose elements are linked
260  * with the 'link' field of the struct block.
261  */
262 static void find_levels(root)
263      struct block *root;
264 {
265     memset((char *) levels, 0, n_blocks * sizeof(*levels));
266     unMarkAll();
267     find_levels_r(root);
268 }
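
/*
 * Illustration only: for a graph root -> {t1, ret1} with
 * t1 -> {ret1, ret2}, the return leaves end up at level 0, t1 at
 * level 1 and root at level 2, with the leaves chained together
 * through their 'link' fields (in traversal-dependent order).
 */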
269 
270 /*
271  * Find dominator relationships.
272  * Assumes graph has been leveled.
273  */
274 static void find_dom(root)
275      struct block *root;
276 {
277     int i;
278     struct block *b;
279     bpf_u_int32 *x;
280 
281     /*
282      * Initialize sets to contain all nodes.
283      */
284     x = all_dom_sets;
285     i = n_blocks * nodewords;
286     while (--i >= 0)
287         *x++ = ~0;
288     /* Root starts off empty. */
289     for (i = nodewords; --i >= 0;)
290         root->dom[i] = 0;
291 
    /* root->level is the highest level number found. */
293     for (i = root->level; i >= 0; --i)
294     {
295         for (b = levels[i]; b; b = b->link)
296         {
297             SET_INSERT(b->dom, b->id);
298             if (JT(b) == 0)
299                 continue;
300             SET_INTERSECT(JT(b)->dom, b->dom, nodewords);
301             SET_INTERSECT(JF(b)->dom, b->dom, nodewords);
302         }
303     }
304 }
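
/*
 * In dataflow terms the loop above computes, for each node n,
 *
 *    dom(root) = { root }
 *    dom(n)    = { n } union ( intersection of dom(p) over preds p )
 *
 * by seeding every set to "all blocks", clearing the root's set, and
 * sweeping from the root level downward, intersecting each node's set
 * into its two successors.  A single level-ordered pass suffices
 * because the flow graph is acyclic.
 */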
305 
306 static void propedom(ep)
307      struct edge *ep;
308 {
309     SET_INSERT(ep->edom, ep->id);
310     if (ep->succ)
311     {
312         SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords);
313         SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords);
314     }
315 }
316 
317 /*
318  * Compute edge dominators.
319  * Assumes graph has been leveled and predecessors established.
320  */
321 static void find_edom(root)
322      struct block *root;
323 {
324     int i;
325     uset x;
326     struct block *b;
327 
328     x = all_edge_sets;
329     for (i = n_edges * edgewords; --i >= 0;)
330         x[i] = ~0;
331 
    /* root->level is the highest level number found. */
333     memset(root->et.edom, 0, edgewords * sizeof(*(uset) 0));
334     memset(root->ef.edom, 0, edgewords * sizeof(*(uset) 0));
335     for (i = root->level; i >= 0; --i)
336     {
337         for (b = levels[i]; b != 0; b = b->link)
338         {
339             propedom(&b->et);
340             propedom(&b->ef);
341         }
342     }
343 }
344 
345 /*
346  * Find the backwards transitive closure of the flow graph.  These sets
347  * are backwards in the sense that we find the set of nodes that reach
348  * a given node, not the set of nodes that can be reached by a node.
349  *
350  * Assumes graph has been leveled.
351  */
352 static void find_closure(root)
353      struct block *root;
354 {
355     int i;
356     struct block *b;
357 
358     /*
359      * Initialize sets to contain no nodes.
360      */
361     memset((char *) all_closure_sets, 0, n_blocks * nodewords * sizeof(*all_closure_sets));
362 
    /* root->level is the highest level number found. */
364     for (i = root->level; i >= 0; --i)
365     {
366         for (b = levels[i]; b; b = b->link)
367         {
368             SET_INSERT(b->closure, b->id);
369             if (JT(b) == 0)
370                 continue;
371             SET_UNION(JT(b)->closure, b->closure, nodewords);
372             SET_UNION(JF(b)->closure, b->closure, nodewords);
373         }
374     }
375 }
376 
377 /*
378  * Return the register number that is used by s.  If A and X are both
379  * used, return AX_ATOM.  If no register is used, return -1.
380  *
381  * The implementation should probably change to an array access.
382  */
383 static int atomuse(s)
384      struct stmt *s;
385 {
386     register int c = s->code;
387 
388     if (c == NOP)
389         return -1;
390 
391     switch (BPF_CLASS(c))
392     {
393 
394         case BPF_RET:
395             return (BPF_RVAL(c) == BPF_A) ? A_ATOM : (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
396 
397         case BPF_LD:
398         case BPF_LDX:
399             return (BPF_MODE(c) == BPF_IND) ? X_ATOM : (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
400 
401         case BPF_ST:
402             return A_ATOM;
403 
404         case BPF_STX:
405             return X_ATOM;
406 
407         case BPF_JMP:
408         case BPF_ALU:
409             if (BPF_SRC(c) == BPF_X)
410                 return AX_ATOM;
411             return A_ATOM;
412 
413         case BPF_MISC:
414             return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
415     }
416     abort();
417     /* NOTREACHED */
418 }
419 
420 /*
421  * Return the register number that is defined by 's'.  We assume that
422  * a single stmt cannot define more than one register.  If no register
423  * is defined, return -1.
424  *
425  * The implementation should probably change to an array access.
426  */
427 static int atomdef(s)
428      struct stmt *s;
429 {
430     if (s->code == NOP)
431         return -1;
432 
433     switch (BPF_CLASS(s->code))
434     {
435 
436         case BPF_LD:
437         case BPF_ALU:
438             return A_ATOM;
439 
440         case BPF_LDX:
441             return X_ATOM;
442 
443         case BPF_ST:
444         case BPF_STX:
445             return s->k;
446 
447         case BPF_MISC:
448             return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
449     }
450     return -1;
451 }
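
/*
 * Examples (illustrative only): for "ld M[3]" (BPF_LD|BPF_MEM),
 * atomuse() returns 3 and atomdef() returns A_ATOM; for "tax"
 * (BPF_MISC|BPF_TAX), atomuse() returns A_ATOM and atomdef() returns
 * X_ATOM; for "add x" (BPF_ALU|BPF_ADD|BPF_X), atomuse() returns
 * AX_ATOM and atomdef() returns A_ATOM.
 */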
452 
453 /*
454  * Compute the sets of registers used, defined, and killed by 'b'.
455  *
456  * "Used" means that a statement in 'b' uses the register before any
457  * statement in 'b' defines it, i.e. it uses the value left in
458  * that register by a predecessor block of this block.
459  * "Defined" means that a statement in 'b' defines it.
460  * "Killed" means that a statement in 'b' defines it before any
461  * statement in 'b' uses it, i.e. it kills the value left in that
462  * register by a predecessor block of this block.
463  */
464 static void compute_local_ud(b)
465      struct block *b;
466 {
467     struct slist *s;
468     atomset def = 0, use = 0, kill = 0;
469     int atom;
470 
471     for (s = b->stmts; s; s = s->next)
472     {
473         if (s->s.code == NOP)
474             continue;
475         atom = atomuse(&s->s);
476         if (atom >= 0)
477         {
478             if (atom == AX_ATOM)
479             {
480                 if (!ATOMELEM(def, X_ATOM))
481                     use |= ATOMMASK(X_ATOM);
482                 if (!ATOMELEM(def, A_ATOM))
483                     use |= ATOMMASK(A_ATOM);
484             }
485             else if (atom < N_ATOMS)
486             {
487                 if (!ATOMELEM(def, atom))
488                     use |= ATOMMASK(atom);
489             }
490             else
491                 abort();
492         }
493         atom = atomdef(&s->s);
494         if (atom >= 0)
495         {
496             if (!ATOMELEM(use, atom))
497                 kill |= ATOMMASK(atom);
498             def |= ATOMMASK(atom);
499         }
500     }
501     if (BPF_CLASS(b->s.code) == BPF_JMP)
502     {
503         /*
504          * XXX - what about RET?
505          */
506         atom = atomuse(&b->s);
507         if (atom >= 0)
508         {
509             if (atom == AX_ATOM)
510             {
511                 if (!ATOMELEM(def, X_ATOM))
512                     use |= ATOMMASK(X_ATOM);
513                 if (!ATOMELEM(def, A_ATOM))
514                     use |= ATOMMASK(A_ATOM);
515             }
516             else if (atom < N_ATOMS)
517             {
518                 if (!ATOMELEM(def, atom))
519                     use |= ATOMMASK(atom);
520             }
521             else
522                 abort();
523         }
524     }
525 
526     b->def = def;
527     b->kill = kill;
528     b->in_use = use;
529 }
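
/*
 * Worked example: for a block containing
 *
 *    ldx M[1]
 *    ld  [x + 0]
 *    st  M[2]
 *
 * X, A and M[2] are each defined before any use of them, so
 * def = kill = {A, X, M[2]} and in_use = {M[1]}: only M[1] carries a
 * value into this block from a predecessor.
 */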
530 
531 /*
532  * Assume graph is already leveled.
533  */
534 static void find_ud(root)
535      struct block *root;
536 {
537     int i, maxlevel;
538     struct block *p;
539 
540     /*
     * root->level is the highest level number found;
542      * count down from there.
543      */
544     maxlevel = root->level;
545     for (i = maxlevel; i >= 0; --i)
546         for (p = levels[i]; p; p = p->link)
547         {
548             compute_local_ud(p);
549             p->out_use = 0;
550         }
551 
552     for (i = 1; i <= maxlevel; ++i)
553     {
554         for (p = levels[i]; p; p = p->link)
555         {
556             p->out_use |= JT(p)->in_use | JF(p)->in_use;
557             p->in_use |= p->out_use & ~p->kill;
558         }
559     }
560 }
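
/*
 * The second loop above is the usual backward liveness computation,
 * swept upward from the leaves (level 0):
 *
 *    out_use(b) = in_use(JT(b)) union in_use(JF(b))
 *    in_use(b)  = use(b) union (out_use(b) - kill(b))
 *
 * One bottom-up sweep is enough because the graph is acyclic and
 * every successor of a block sits at a strictly lower level.
 */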
561 
562 /*
 * These data structures are used in a Cocke and Schwartz style
 * value numbering scheme.  Since the flowgraph is acyclic,
 * exit values can be propagated from a node's predecessors
 * provided the value is uniquely defined.
567  */
568 struct valnode
569 {
570     int code;
571     int v0, v1;
572     int val;
573     struct valnode *next;
574 };
575 
576 #define MODULUS 213
577 static __thread struct valnode *hashtbl[MODULUS];
578 static __thread int curval;
579 static __thread int maxval;
580 
581 /* Integer constants mapped with the load immediate opcode. */
582 #define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L)
583 
584 struct vmapinfo
585 {
586     int is_const;
587     bpf_int32 const_val;
588 };
589 
590 __thread struct vmapinfo *vmap;
591 __thread struct valnode *vnode_base;
592 __thread struct valnode *next_vnode;
593 
594 static void init_val()
595 {
596     curval = 0;
597     next_vnode = vnode_base;
598     memset((char *) vmap, 0, maxval * sizeof(*vmap));
599     memset((char *) hashtbl, 0, sizeof hashtbl);
600 }
601 
602 /* Because we really don't have an IR, this stuff is a little messy. */
603 static int F(code, v0, v1)
604      int code;
605      int v0, v1;
606 {
607     u_int hash;
608     int val;
609     struct valnode *p;
610 
611     hash = (u_int) code ^ (v0 << 4) ^ (v1 << 8);
612     hash %= MODULUS;
613 
614     for (p = hashtbl[hash]; p; p = p->next)
615         if (p->code == code && p->v0 == v0 && p->v1 == v1)
616             return p->val;
617 
618     val = ++curval;
619     if (BPF_MODE(code) == BPF_IMM && (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX))
620     {
621         vmap[val].const_val = v0;
622         vmap[val].is_const = 1;
623     }
624     p = next_vnode++;
625     p->val = val;
626     p->code = code;
627     p->v0 = v0;
628     p->v1 = v1;
629     p->next = hashtbl[hash];
630     hashtbl[hash] = p;
631 
632     return val;
633 }
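
/*
 * Example of the numbering: the first "ld #4" seen in a pass hashes
 * the triple (BPF_LD|BPF_IMM|BPF_W, 4, 0), gets a fresh value number
 * and is recorded in vmap[] as the constant 4; a later "ldx #4" hashes
 * to the same triple and receives the same number, which is how the
 * optimizer learns that A and X hold the same (and here constant)
 * value.
 */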
634 
635 static inline void vstore(s, valp, newval, alter)
636      struct stmt *s;
637      int *valp;
638      int newval;
639      int alter;
640 {
641     if (alter && *valp == newval)
642         s->code = NOP;
643     else
644         *valp = newval;
645 }
646 
647 static void fold_op(s, v0, v1)
648      struct stmt *s;
649      int v0, v1;
650 {
651     bpf_u_int32 a, b;
652 
653     a = vmap[v0].const_val;
654     b = vmap[v1].const_val;
655 
656     switch (BPF_OP(s->code))
657     {
658         case BPF_ADD:
659             a += b;
660             break;
661 
662         case BPF_SUB:
663             a -= b;
664             break;
665 
666         case BPF_MUL:
667             a *= b;
668             break;
669 
670         case BPF_DIV:
671             if (b == 0)
672                 bpf_error("division by zero");
673             a /= b;
674             break;
675 
676         case BPF_AND:
677             a &= b;
678             break;
679 
680         case BPF_OR:
681             a |= b;
682             break;
683 
684         case BPF_LSH:
685             a <<= b;
686             break;
687 
688         case BPF_RSH:
689             a >>= b;
690             break;
691 
692         case BPF_NEG:
693             a = -a;
694             break;
695 
696         default:
697             abort();
698     }
699     s->k = a;
700     s->code = BPF_LD | BPF_IMM;
701     done = 0;
702 }
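
/*
 * Example: if the accumulator's value number is known to be the
 * constant 6 and the statement is "and #15", fold_op() computes
 * 6 & 15 == 6 and rewrites the statement as "ld #6", turning the
 * whole operation into a load immediate at optimize time.
 */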
703 
704 static inline struct slist *this_op(s)
705      struct slist *s;
706 {
707     while (s != 0 && s->s.code == NOP)
708         s = s->next;
709     return s;
710 }
711 
712 static void opt_not(b)
713      struct block *b;
714 {
715     struct block *tmp = JT(b);
716 
717     JT(b) = JF(b);
718     JF(b) = tmp;
719 }
720 
721 static void opt_peep(b)
722      struct block *b;
723 {
724     struct slist *s;
725     struct slist *next, *last;
726     int val;
727 
728     s = b->stmts;
729     if (s == 0)
730         return;
731 
732     last = s;
733     for ( /*empty */ ; /*empty */ ; s = next)
734     {
735         /*
736          * Skip over nops.
737          */
738         s = this_op(s);
739         if (s == 0)
740             break;              /* nothing left in the block */
741 
742         /*
743          * Find the next real instruction after that one
744          * (skipping nops).
745          */
746         next = this_op(s->next);
747         if (next == 0)
748             break;              /* no next instruction */
749         last = next;
750 
751         /*
752          * st  M[k] --> st  M[k]
753          * ldx M[k]     tax
754          */
755         if (s->s.code == BPF_ST && next->s.code == (BPF_LDX | BPF_MEM) && s->s.k == next->s.k)
756         {
757             done = 0;
758             next->s.code = BPF_MISC | BPF_TAX;
759         }
760         /*
761          * ld  #k   --> ldx  #k
762          * tax          txa
763          */
764         if (s->s.code == (BPF_LD | BPF_IMM) && next->s.code == (BPF_MISC | BPF_TAX))
765         {
766             s->s.code = BPF_LDX | BPF_IMM;
767             next->s.code = BPF_MISC | BPF_TXA;
768             done = 0;
769         }
770         /*
771          * This is an ugly special case, but it happens
772          * when you say tcp[k] or udp[k] where k is a constant.
773          */
774         if (s->s.code == (BPF_LD | BPF_IMM))
775         {
776             struct slist *add, *tax, *ild;
777 
778             /*
779              * Check that X isn't used on exit from this
780              * block (which the optimizer might cause).
781              * We know the code generator won't generate
782              * any local dependencies.
783              */
784             if (ATOMELEM(b->out_use, X_ATOM))
785                 continue;
786 
787             /*
788              * Check that the instruction following the ldi
789              * is an addx, or it's an ldxms with an addx
790              * following it (with 0 or more nops between the
791              * ldxms and addx).
792              */
793             if (next->s.code != (BPF_LDX | BPF_MSH | BPF_B))
794                 add = next;
795             else
796                 add = this_op(next->next);
797             if (add == 0 || add->s.code != (BPF_ALU | BPF_ADD | BPF_X))
798                 continue;
799 
800             /*
801              * Check that a tax follows that (with 0 or more
802              * nops between them).
803              */
804             tax = this_op(add->next);
805             if (tax == 0 || tax->s.code != (BPF_MISC | BPF_TAX))
806                 continue;
807 
808             /*
809              * Check that an ild follows that (with 0 or more
810              * nops between them).
811              */
812             ild = this_op(tax->next);
813             if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD || BPF_MODE(ild->s.code) != BPF_IND)
814                 continue;
815             /*
816              * We want to turn this sequence:
817              *
818              * (004) ldi     #0x2       {s}
819              * (005) ldxms   [14]       {next}  -- optional
820              * (006) addx           {add}
821              * (007) tax            {tax}
822              * (008) ild     [x+0]      {ild}
823              *
824              * into this sequence:
825              *
826              * (004) nop
827              * (005) ldxms   [14]
828              * (006) nop
829              * (007) nop
830              * (008) ild     [x+2]
831              *
832              * XXX We need to check that X is not
833              * subsequently used, because we want to change
834              * what'll be in it after this sequence.
835              *
836              * We know we can eliminate the accumulator
837              * modifications earlier in the sequence since
838              * it is defined by the last stmt of this sequence
839              * (i.e., the last statement of the sequence loads
840              * a value into the accumulator, so we can eliminate
841              * earlier operations on the accumulator).
842              */
843             ild->s.k += s->s.k;
844             s->s.code = NOP;
845             add->s.code = NOP;
846             tax->s.code = NOP;
847             done = 0;
848         }
849     }
850     /*
851      * If the comparison at the end of a block is an equality
852      * comparison against a constant, and nobody uses the value
853      * we leave in the A register at the end of a block, and
854      * the operation preceding the comparison is an arithmetic
     * operation, we can sometimes optimize it away.
856      */
857     if (b->s.code == (BPF_JMP | BPF_JEQ | BPF_K) && !ATOMELEM(b->out_use, A_ATOM))
858     {
859         /*
860          * We can optimize away certain subtractions of the
861          * X register.
862          */
863         if (last->s.code == (BPF_ALU | BPF_SUB | BPF_X))
864         {
865             val = b->val[X_ATOM];
866             if (vmap[val].is_const)
867             {
868                 /*
869                  * If we have a subtract to do a comparison,
870                  * and the X register is a known constant,
871                  * we can merge this value into the
872                  * comparison:
873                  *
874                  * sub x  ->    nop
875                  * jeq #y   jeq #(x+y)
876                  */
877                 b->s.k += vmap[val].const_val;
878                 last->s.code = NOP;
879                 done = 0;
880             }
881             else if (b->s.k == 0)
882             {
883                 /*
884                  * If the X register isn't a constant,
885                  * and the comparison in the test is
886                  * against 0, we can compare with the
887                  * X register, instead:
888                  *
889                  * sub x  ->    nop
890                  * jeq #0   jeq x
891                  */
892                 last->s.code = NOP;
893                 b->s.code = BPF_JMP | BPF_JEQ | BPF_X;
894                 done = 0;
895             }
896         }
897         /*
898          * Likewise, a constant subtract can be simplified:
899          *
900          * sub #x ->    nop
901          * jeq #y ->    jeq #(x+y)
902          */
903         else if (last->s.code == (BPF_ALU | BPF_SUB | BPF_K))
904         {
905             last->s.code = NOP;
906             b->s.k += last->s.k;
907             done = 0;
908         }
909         /*
910          * And, similarly, a constant AND can be simplified
911          * if we're testing against 0, i.e.:
912          *
913          * and #k   nop
914          * jeq #0  ->   jset #k
915          */
916         else if (last->s.code == (BPF_ALU | BPF_AND | BPF_K) && b->s.k == 0)
917         {
918             b->s.k = last->s.k;
919             b->s.code = BPF_JMP | BPF_K | BPF_JSET;
920             last->s.code = NOP;
921             done = 0;
922             opt_not(b);
923         }
924     }
925     /*
926      * jset #0        ->   never
927      * jset #ffffffff ->   always
928      */
929     if (b->s.code == (BPF_JMP | BPF_K | BPF_JSET))
930     {
931         if (b->s.k == 0)
932             JT(b) = JF(b);
933         if (b->s.k == 0xffffffff)
934             JF(b) = JT(b);
935     }
936     /*
937      * If we're comparing against the index register, and the index
938      * register is a known constant, we can just compare against that
939      * constant.
940      */
941     val = b->val[X_ATOM];
942     if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X)
943     {
944         bpf_int32 v = vmap[val].const_val;
945         b->s.code &= ~BPF_X;
946         b->s.k = v;
947     }
948     /*
949      * If the accumulator is a known constant, we can compute the
950      * comparison result.
951      */
952     val = b->val[A_ATOM];
953     if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K)
954     {
955         bpf_int32 v = vmap[val].const_val;
956         switch (BPF_OP(b->s.code))
957         {
958 
959             case BPF_JEQ:
960                 v = v == b->s.k;
961                 break;
962 
963             case BPF_JGT:
964                 v = (unsigned) v > b->s.k;
965                 break;
966 
967             case BPF_JGE:
968                 v = (unsigned) v >= b->s.k;
969                 break;
970 
971             case BPF_JSET:
972                 v &= b->s.k;
973                 break;
974 
975             default:
976                 abort();
977         }
978         if (JF(b) != JT(b))
979             done = 0;
980         if (v)
981             JF(b) = JT(b);
982         else
983             JT(b) = JF(b);
984     }
985 }
986 
987 /*
988  * Compute the symbolic value of expression of 's', and update
989  * anything it defines in the value table 'val'.  If 'alter' is true,
990  * do various optimizations.  This code would be cleaner if symbolic
991  * evaluation and code transformations weren't folded together.
992  */
993 static void opt_stmt(s, val, alter)
994      struct stmt *s;
995      int val[];
996      int alter;
997 {
998     int op;
999     int v;
1000 
1001     switch (s->code)
1002     {
1003 
1004         case BPF_LD | BPF_ABS | BPF_W:
1005         case BPF_LD | BPF_ABS | BPF_H:
1006         case BPF_LD | BPF_ABS | BPF_B:
1007             v = F(s->code, s->k, 0L);
1008             vstore(s, &val[A_ATOM], v, alter);
1009             break;
1010 
1011         case BPF_LD | BPF_IND | BPF_W:
1012         case BPF_LD | BPF_IND | BPF_H:
1013         case BPF_LD | BPF_IND | BPF_B:
1014             v = val[X_ATOM];
1015             if (alter && vmap[v].is_const)
1016             {
1017                 s->code = BPF_LD | BPF_ABS | BPF_SIZE(s->code);
1018                 s->k += vmap[v].const_val;
1019                 v = F(s->code, s->k, 0L);
1020                 done = 0;
1021             }
1022             else
1023                 v = F(s->code, s->k, v);
1024             vstore(s, &val[A_ATOM], v, alter);
1025             break;
1026 
1027         case BPF_LD | BPF_LEN:
1028             v = F(s->code, 0L, 0L);
1029             vstore(s, &val[A_ATOM], v, alter);
1030             break;
1031 
1032         case BPF_LD | BPF_IMM:
1033             v = K(s->k);
1034             vstore(s, &val[A_ATOM], v, alter);
1035             break;
1036 
1037         case BPF_LDX | BPF_IMM:
1038             v = K(s->k);
1039             vstore(s, &val[X_ATOM], v, alter);
1040             break;
1041 
1042         case BPF_LDX | BPF_MSH | BPF_B:
1043             v = F(s->code, s->k, 0L);
1044             vstore(s, &val[X_ATOM], v, alter);
1045             break;
1046 
1047         case BPF_ALU | BPF_NEG:
1048             if (alter && vmap[val[A_ATOM]].is_const)
1049             {
1050                 s->code = BPF_LD | BPF_IMM;
1051                 s->k = -vmap[val[A_ATOM]].const_val;
1052                 val[A_ATOM] = K(s->k);
1053             }
1054             else
1055                 val[A_ATOM] = F(s->code, val[A_ATOM], 0L);
1056             break;
1057 
1058         case BPF_ALU | BPF_ADD | BPF_K:
1059         case BPF_ALU | BPF_SUB | BPF_K:
1060         case BPF_ALU | BPF_MUL | BPF_K:
1061         case BPF_ALU | BPF_DIV | BPF_K:
1062         case BPF_ALU | BPF_AND | BPF_K:
1063         case BPF_ALU | BPF_OR | BPF_K:
1064         case BPF_ALU | BPF_LSH | BPF_K:
1065         case BPF_ALU | BPF_RSH | BPF_K:
1066             op = BPF_OP(s->code);
1067             if (alter)
1068             {
1069                 if (s->k == 0)
1070                 {
1071                     /* don't optimize away "sub #0"
1072                      * as it may be needed later to
1073                      * fixup the generated math code */
1074                     if (op == BPF_ADD || op == BPF_LSH || op == BPF_RSH || op == BPF_OR)
1075                     {
1076                         s->code = NOP;
1077                         break;
1078                     }
1079                     if (op == BPF_MUL || op == BPF_AND)
1080                     {
1081                         s->code = BPF_LD | BPF_IMM;
1082                         val[A_ATOM] = K(s->k);
1083                         break;
1084                     }
1085                 }
1086                 if (vmap[val[A_ATOM]].is_const)
1087                 {
1088                     fold_op(s, val[A_ATOM], K(s->k));
1089                     val[A_ATOM] = K(s->k);
1090                     break;
1091                 }
1092             }
1093             val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
1094             break;
1095 
1096         case BPF_ALU | BPF_ADD | BPF_X:
1097         case BPF_ALU | BPF_SUB | BPF_X:
1098         case BPF_ALU | BPF_MUL | BPF_X:
1099         case BPF_ALU | BPF_DIV | BPF_X:
1100         case BPF_ALU | BPF_AND | BPF_X:
1101         case BPF_ALU | BPF_OR | BPF_X:
1102         case BPF_ALU | BPF_LSH | BPF_X:
1103         case BPF_ALU | BPF_RSH | BPF_X:
1104             op = BPF_OP(s->code);
1105             if (alter && vmap[val[X_ATOM]].is_const)
1106             {
1107                 if (vmap[val[A_ATOM]].is_const)
1108                 {
1109                     fold_op(s, val[A_ATOM], val[X_ATOM]);
1110                     val[A_ATOM] = K(s->k);
1111                 }
1112                 else
1113                 {
1114                     s->code = BPF_ALU | BPF_K | op;
1115                     s->k = vmap[val[X_ATOM]].const_val;
1116                     done = 0;
1117                     val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
1118                 }
1119                 break;
1120             }
1121             /*
1122              * Check if we're doing something to an accumulator
1123              * that is 0, and simplify.  This may not seem like
1124              * much of a simplification but it could open up further
1125              * optimizations.
1126              * XXX We could also check for mul by 1, etc.
1127              */
1128             if (alter && vmap[val[A_ATOM]].is_const && vmap[val[A_ATOM]].const_val == 0)
1129             {
1130                 if (op == BPF_ADD || op == BPF_OR)
1131                 {
1132                     s->code = BPF_MISC | BPF_TXA;
1133                     vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1134                     break;
1135                 }
1136                 else if (op == BPF_MUL || op == BPF_DIV || op == BPF_AND || op == BPF_LSH || op == BPF_RSH)
1137                 {
1138                     s->code = BPF_LD | BPF_IMM;
1139                     s->k = 0;
1140                     vstore(s, &val[A_ATOM], K(s->k), alter);
1141                     break;
1142                 }
1143                 else if (op == BPF_NEG)
1144                 {
1145                     s->code = NOP;
1146                     break;
1147                 }
1148             }
1149             val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
1150             break;
1151 
1152         case BPF_MISC | BPF_TXA:
1153             vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1154             break;
1155 
1156         case BPF_LD | BPF_MEM:
1157             v = val[s->k];
1158             if (alter && vmap[v].is_const)
1159             {
1160                 s->code = BPF_LD | BPF_IMM;
1161                 s->k = vmap[v].const_val;
1162                 done = 0;
1163             }
1164             vstore(s, &val[A_ATOM], v, alter);
1165             break;
1166 
1167         case BPF_MISC | BPF_TAX:
1168             vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1169             break;
1170 
1171         case BPF_LDX | BPF_MEM:
1172             v = val[s->k];
1173             if (alter && vmap[v].is_const)
1174             {
1175                 s->code = BPF_LDX | BPF_IMM;
1176                 s->k = vmap[v].const_val;
1177                 done = 0;
1178             }
1179             vstore(s, &val[X_ATOM], v, alter);
1180             break;
1181 
1182         case BPF_ST:
1183             vstore(s, &val[s->k], val[A_ATOM], alter);
1184             break;
1185 
1186         case BPF_STX:
1187             vstore(s, &val[s->k], val[X_ATOM], alter);
1188             break;
1189     }
1190 }
1191 
1192 static void deadstmt(s, last)
1193      register struct stmt *s;
1194      register struct stmt *last[];
1195 {
1196     register int atom;
1197 
1198     atom = atomuse(s);
1199     if (atom >= 0)
1200     {
1201         if (atom == AX_ATOM)
1202         {
1203             last[X_ATOM] = 0;
1204             last[A_ATOM] = 0;
1205         }
1206         else
1207             last[atom] = 0;
1208     }
1209     atom = atomdef(s);
1210     if (atom >= 0)
1211     {
1212         if (last[atom])
1213         {
1214             done = 0;
1215             last[atom]->code = NOP;
1216         }
1217         last[atom] = s;
1218     }
1219 }
1220 
1221 static void opt_deadstores(b)
1222      register struct block *b;
1223 {
1224     register struct slist *s;
1225     register int atom;
1226     struct stmt *last[N_ATOMS];
1227 
1228     memset((char *) last, 0, sizeof last);
1229 
1230     for (s = b->stmts; s != 0; s = s->next)
1231         deadstmt(&s->s, last);
1232     deadstmt(&b->s, last);
1233 
1234     for (atom = 0; atom < N_ATOMS; ++atom)
1235         if (last[atom] && !ATOMELEM(b->out_use, atom))
1236         {
1237             last[atom]->code = NOP;
1238             done = 0;
1239         }
1240 }
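
/*
 * Example: in a block containing
 *
 *    st  M[0]
 *    ld  #1
 *    st  M[0]
 *
 * the first store is overwritten before M[0] is ever read, so
 * deadstmt() turns it into a NOP; the second store survives unless
 * out_use shows that no successor reads M[0], in which case the loop
 * above removes it as well.
 */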
1241 
1242 static void opt_blk(b, do_stmts)
1243      struct block *b;
1244      int do_stmts;
1245 {
1246     struct slist *s;
1247     struct edge *p;
1248     int i;
1249     bpf_int32 aval, xval;
1250 
1251 #if YYDEBUG
1252     for (s = b->stmts; s && s->next; s = s->next)
1253         if (BPF_CLASS(s->s.code) == BPF_JMP)
1254         {
1255             do_stmts = 0;
1256             break;
1257         }
1258 #endif
1259 
1260     /*
1261      * Initialize the atom values.
1262      */
1263     p = b->in_edges;
1264     if (p == 0)
1265     {
1266         /*
1267          * We have no predecessors, so everything is undefined
1268          * upon entry to this block.
1269          */
1270         memset((char *) b->val, 0, sizeof(b->val));
1271     }
1272     else
1273     {
1274         /*
1275          * Inherit values from our predecessors.
1276          *
1277          * First, get the values from the predecessor along the
1278          * first edge leading to this node.
1279          */
1280         memcpy((char *) b->val, (char *) p->pred->val, sizeof(b->val));
1281         /*
1282          * Now look at all the other nodes leading to this node.
1283          * If, for the predecessor along that edge, a register
1284          * has a different value from the one we have (i.e.,
1285          * control paths are merging, and the merging paths
1286          * assign different values to that register), give the
1287          * register the undefined value of 0.
1288          */
1289         while ((p = p->next) != NULL)
1290         {
1291             for (i = 0; i < N_ATOMS; ++i)
1292                 if (b->val[i] != p->pred->val[i])
1293                     b->val[i] = 0;
1294         }
1295     }
1296     aval = b->val[A_ATOM];
1297     xval = b->val[X_ATOM];
1298     for (s = b->stmts; s; s = s->next)
1299         opt_stmt(&s->s, b->val, do_stmts);
1300 
1301     /*
1302      * This is a special case: if we don't use anything from this
1303      * block, and we load the accumulator or index register with a
1304      * value that is already there, or if this block is a return,
1305      * eliminate all the statements.
1306      *
1307      * XXX - what if it does a store?
1308      *
1309      * XXX - why does it matter whether we use anything from this
1310      * block?  If the accumulator or index register doesn't change
1311      * its value, isn't that OK even if we use that value?
1312      *
1313      * XXX - if we load the accumulator with a different value,
1314      * and the block ends with a conditional branch, we obviously
1315      * can't eliminate it, as the branch depends on that value.
1316      * For the index register, the conditional branch only depends
1317      * on the index register value if the test is against the index
1318      * register value rather than a constant; if nothing uses the
1319      * value we put into the index register, and we're not testing
1320      * against the index register's value, and there aren't any
1321      * other problems that would keep us from eliminating this
1322      * block, can we eliminate it?
1323      */
1324     if (do_stmts &&
1325         ((b->out_use == 0 && aval != 0 && b->val[A_ATOM] == aval &&
1326           xval != 0 && b->val[X_ATOM] == xval) || BPF_CLASS(b->s.code) == BPF_RET))
1327     {
1328         if (b->stmts != 0)
1329         {
1330             b->stmts = 0;
1331             done = 0;
1332         }
1333     }
1334     else
1335     {
1336         opt_peep(b);
1337         opt_deadstores(b);
1338     }
1339     /*
1340      * Set up values for branch optimizer.
1341      */
1342     if (BPF_SRC(b->s.code) == BPF_K)
1343         b->oval = K(b->s.k);
1344     else
1345         b->oval = b->val[X_ATOM];
1346     b->et.code = b->s.code;
1347     b->ef.code = -b->s.code;
1348 }
1349 
1350 /*
1351  * Return true if any register that is used on exit from 'succ', has
1352  * an exit value that is different from the corresponding exit value
1353  * from 'b'.
1354  */
1355 static int use_conflict(b, succ)
1356      struct block *b, *succ;
1357 {
1358     int atom;
1359     atomset use = succ->out_use;
1360 
1361     if (use == 0)
1362         return 0;
1363 
1364     for (atom = 0; atom < N_ATOMS; ++atom)
1365         if (ATOMELEM(use, atom))
1366             if (b->val[atom] != succ->val[atom])
1367                 return 1;
1368     return 0;
1369 }
1370 
1371 static struct block *fold_edge(child, ep)
1372      struct block *child;
1373      struct edge *ep;
1374 {
1375     int sense;
1376     int aval0, aval1, oval0, oval1;
1377     int code = ep->code;
1378 
1379     if (code < 0)
1380     {
1381         code = -code;
1382         sense = 0;
1383     }
1384     else
1385         sense = 1;
1386 
1387     if (child->s.code != code)
1388         return 0;
1389 
1390     aval0 = child->val[A_ATOM];
1391     oval0 = child->oval;
1392     aval1 = ep->pred->val[A_ATOM];
1393     oval1 = ep->pred->oval;
1394 
1395     if (aval0 != aval1)
1396         return 0;
1397 
1398     if (oval0 == oval1)
1399         /*
1400          * The operands of the branch instructions are
1401          * identical, so the result is true if a true
1402          * branch was taken to get here, otherwise false.
1403          */
1404         return sense ? JT(child) : JF(child);
1405 
1406     if (sense && code == (BPF_JMP | BPF_JEQ | BPF_K))
1407         /*
1408          * At this point, we only know the comparison if we
1409          * came down the true branch, and it was an equality
1410          * comparison with a constant.
1411          *
1412          * I.e., if we came down the true branch, and the branch
1413          * was an equality comparison with a constant, we know the
1414          * accumulator contains that constant.  If we came down
1415          * the false branch, or the comparison wasn't with a
1416          * constant, we don't know what was in the accumulator.
1417          *
1418          * We rely on the fact that distinct constants have distinct
1419          * value numbers.
1420          */
1421         return JF(child);
1422 
1423     return 0;
1424 }
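
/*
 * Example: suppose a dominating edge is the true branch of a
 * "jeq #0x86dd" test and the successor block being examined performs
 * "jeq #0x800" on the same accumulator value.  Anything arriving over
 * that edge has A == 0x86dd, so the second comparison must fail and
 * fold_edge() returns JF(child); opt_j() can then retarget the edge
 * directly.
 */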
1425 
1426 static void opt_j(ep)
1427      struct edge *ep;
1428 {
1429     register int i, k;
1430     register struct block *target;
1431 
1432     if (JT(ep->succ) == 0)
1433         return;
1434 
1435     if (JT(ep->succ) == JF(ep->succ))
1436     {
1437         /*
1438          * Common branch targets can be eliminated, provided
1439          * there is no data dependency.
1440          */
1441         if (!use_conflict(ep->pred, ep->succ->et.succ))
1442         {
1443             done = 0;
1444             ep->succ = JT(ep->succ);
1445         }
1446     }
1447     /*
1448      * For each edge dominator that matches the successor of this
     * edge, promote the edge successor to its grandchild.
     *
     * XXX We violate the set abstraction here in favor of a reasonably
1452      * efficient loop.
1453      */
1454   top:
1455     for (i = 0; i < edgewords; ++i)
1456     {
1457         register bpf_u_int32 x = ep->edom[i];
1458 
1459         while (x != 0)
1460         {
1461             k = ffs(x) - 1;
1462             x &= ~(1 << k);
1463             k += i * BITS_PER_WORD;
1464 
1465             target = fold_edge(ep->succ, edges[k]);
1466             /*
1467              * Check that there is no data dependency between
1468              * nodes that will be violated if we move the edge.
1469              */
1470             if (target != 0 && !use_conflict(ep->pred, target))
1471             {
1472                 done = 0;
1473                 ep->succ = target;
1474                 if (JT(target) != 0)
1475                     /*
1476                      * Start over unless we hit a leaf.
1477                      */
1478                     goto top;
1479                 return;
1480             }
1481         }
1482     }
1483 }
1484 
1485 
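/*
 * or_pullup() and and_pullup() reshuffle a chain of tests that all
 * share a branch target with 'b' (the shape the code generator emits
 * for "or" and "and" expressions).  Walking the false-branch ("or")
 * or true-branch ("and") chain, they find the first node whose
 * accumulator value differs from that of b's predecessors and then a
 * later node whose value matches again, and splice the matching node
 * up to that point.  Grouping tests on the same value next to each
 * other gives opt_j()/fold_edge() more chances to collapse them on a
 * later pass.
 */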
1486 static void or_pullup(b)
1487      struct block *b;
1488 {
1489     int val, at_top;
1490     struct block *pull;
1491     struct block **diffp, **samep;
1492     struct edge *ep;
1493 
1494     ep = b->in_edges;
1495     if (ep == 0)
1496         return;
1497 
1498     /*
1499      * Make sure each predecessor loads the same value.
1500      * XXX why?
1501      */
1502     val = ep->pred->val[A_ATOM];
1503     for (ep = ep->next; ep != 0; ep = ep->next)
1504         if (val != ep->pred->val[A_ATOM])
1505             return;
1506 
1507     if (JT(b->in_edges->pred) == b)
1508         diffp = &JT(b->in_edges->pred);
1509     else
1510         diffp = &JF(b->in_edges->pred);
1511 
1512     at_top = 1;
1513     while (1)
1514     {
1515         if (*diffp == 0)
1516             return;
1517 
1518         if (JT(*diffp) != JT(b))
1519             return;
1520 
1521         if (!SET_MEMBER((*diffp)->dom, b->id))
1522             return;
1523 
1524         if ((*diffp)->val[A_ATOM] != val)
1525             break;
1526 
1527         diffp = &JF(*diffp);
1528         at_top = 0;
1529     }
1530     samep = &JF(*diffp);
1531     while (1)
1532     {
1533         if (*samep == 0)
1534             return;
1535 
1536         if (JT(*samep) != JT(b))
1537             return;
1538 
1539         if (!SET_MEMBER((*samep)->dom, b->id))
1540             return;
1541 
1542         if ((*samep)->val[A_ATOM] == val)
1543             break;
1544 
        /* XXX Need to check that there are no data dependencies
           between *diffp and *samep.  Currently, the code generator
           will not produce such dependencies. */
1548         samep = &JF(*samep);
1549     }
1550 #ifdef notdef
1551     /* XXX This doesn't cover everything. */
1552     for (i = 0; i < N_ATOMS; ++i)
1553         if ((*samep)->val[i] != pred->val[i])
1554             return;
1555 #endif
1556     /* Pull up the node. */
1557     pull = *samep;
1558     *samep = JF(pull);
1559     JF(pull) = *diffp;
1560 
1561     /*
1562      * At the top of the chain, each predecessor needs to point at the
1563      * pulled up node.  Inside the chain, there is only one predecessor
1564      * to worry about.
1565      */
1566     if (at_top)
1567     {
1568         for (ep = b->in_edges; ep != 0; ep = ep->next)
1569         {
1570             if (JT(ep->pred) == b)
1571                 JT(ep->pred) = pull;
1572             else
1573                 JF(ep->pred) = pull;
1574         }
1575     }
1576     else
1577         *diffp = pull;
1578 
1579     done = 0;
1580 }
1581 
1582 static void and_pullup(b)
1583      struct block *b;
1584 {
1585     int val, at_top;
1586     struct block *pull;
1587     struct block **diffp, **samep;
1588     struct edge *ep;
1589 
1590     ep = b->in_edges;
1591     if (ep == 0)
1592         return;
1593 
1594     /*
1595      * Make sure each predecessor loads the same value.
1596      */
1597     val = ep->pred->val[A_ATOM];
1598     for (ep = ep->next; ep != 0; ep = ep->next)
1599         if (val != ep->pred->val[A_ATOM])
1600             return;
1601 
1602     if (JT(b->in_edges->pred) == b)
1603         diffp = &JT(b->in_edges->pred);
1604     else
1605         diffp = &JF(b->in_edges->pred);
1606 
1607     at_top = 1;
1608     while (1)
1609     {
1610         if (*diffp == 0)
1611             return;
1612 
1613         if (JF(*diffp) != JF(b))
1614             return;
1615 
1616         if (!SET_MEMBER((*diffp)->dom, b->id))
1617             return;
1618 
1619         if ((*diffp)->val[A_ATOM] != val)
1620             break;
1621 
1622         diffp = &JT(*diffp);
1623         at_top = 0;
1624     }
1625     samep = &JT(*diffp);
1626     while (1)
1627     {
1628         if (*samep == 0)
1629             return;
1630 
1631         if (JF(*samep) != JF(b))
1632             return;
1633 
1634         if (!SET_MEMBER((*samep)->dom, b->id))
1635             return;
1636 
1637         if ((*samep)->val[A_ATOM] == val)
1638             break;
1639 
1640         /* XXX Need to check that there are no data dependencies
1641            between diffp and samep.  Currently, the code generator
1642            will not produce such dependencies. */
1643         samep = &JT(*samep);
1644     }
1645 #ifdef notdef
1646     /* XXX This doesn't cover everything. */
1647     for (i = 0; i < N_ATOMS; ++i)
1648         if ((*samep)->val[i] != pred->val[i])
1649             return;
1650 #endif
1651     /* Pull up the node. */
1652     pull = *samep;
1653     *samep = JT(pull);
1654     JT(pull) = *diffp;
1655 
1656     /*
1657      * At the top of the chain, each predecessor needs to point at the
1658      * pulled up node.  Inside the chain, there is only one predecessor
1659      * to worry about.
1660      */
1661     if (at_top)
1662     {
1663         for (ep = b->in_edges; ep != 0; ep = ep->next)
1664         {
1665             if (JT(ep->pred) == b)
1666                 JT(ep->pred) = pull;
1667             else
1668                 JF(ep->pred) = pull;
1669         }
1670     }
1671     else
1672         *diffp = pull;
1673 
1674     done = 0;
1675 }
1676 
1677 static void opt_blks(root, do_stmts)
1678      struct block *root;
1679      int do_stmts;
1680 {
1681     int i, maxlevel;
1682     struct block *p;
1683 
1684     init_val();
1685     maxlevel = root->level;
1686 
1687     find_inedges(root);
1688     for (i = maxlevel; i >= 0; --i)
1689         for (p = levels[i]; p; p = p->link)
1690             opt_blk(p, do_stmts);
1691 
1692     if (do_stmts)
1693         /*
1694          * No point trying to move branches; it can't possibly
1695          * make a difference at this point.
1696          */
1697         return;
1698 
1699     for (i = 1; i <= maxlevel; ++i)
1700     {
1701         for (p = levels[i]; p; p = p->link)
1702         {
1703             opt_j(&p->et);
1704             opt_j(&p->ef);
1705         }
1706     }
1707 
1708     find_inedges(root);
1709     for (i = 1; i <= maxlevel; ++i)
1710     {
1711         for (p = levels[i]; p; p = p->link)
1712         {
1713             or_pullup(p);
1714             and_pullup(p);
1715         }
1716     }
1717 }
1718 
1719 static inline void link_inedge(parent, child)
1720      struct edge *parent;
1721      struct block *child;
1722 {
1723     parent->next = child->in_edges;
1724     child->in_edges = parent;
1725 }
1726 
1727 static void find_inedges(root)
1728      struct block *root;
1729 {
1730     int i;
1731     struct block *b;
1732 
1733     for (i = 0; i < n_blocks; ++i)
1734         blocks[i]->in_edges = 0;
1735 
1736     /*
1737      * Traverse the graph, adding each edge to the predecessor
1738      * list of its successors.  Skip the leaves (i.e. level 0).
1739      */
1740     for (i = root->level; i > 0; --i)
1741     {
1742         for (b = levels[i]; b != 0; b = b->link)
1743         {
1744             link_inedge(&b->et, JT(b));
1745             link_inedge(&b->ef, JF(b));
1746         }
1747     }
1748 }
1749 
1750 static void opt_root(b)
1751      struct block **b;
1752 {
1753     struct slist *tmp, *s;
1754 
1755     s = (*b)->stmts;
1756     (*b)->stmts = 0;
1757     while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1758         *b = JT(*b);
1759 
1760     tmp = (*b)->stmts;
1761     if (tmp != 0)
1762         sappend(s, tmp);
1763     (*b)->stmts = s;
1764 
1765     /*
1766      * If the root node is a return, then there is no
1767      * point executing any statements (since the bpf machine
1768      * has no side effects).
1769      */
1770     if (BPF_CLASS((*b)->s.code) == BPF_RET)
1771         (*b)->stmts = 0;
1772 }
1773 
1774 static void opt_loop(root, do_stmts)
1775      struct block *root;
1776      int do_stmts;
1777 {
1778 
1779 #ifdef BDEBUG
1780     if (dflag > 1)
1781     {
1782         printf("opt_loop(root, %d) begin\n", do_stmts);
1783         opt_dump(root);
1784     }
1785 #endif
1786     do
1787     {
1788         done = 1;
1789         find_levels(root);
1790         find_dom(root);
1791         find_closure(root);
1792         find_ud(root);
1793         find_edom(root);
1794         opt_blks(root, do_stmts);
1795 #ifdef BDEBUG
1796         if (dflag > 1)
1797         {
1798             printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, done);
1799             opt_dump(root);
1800         }
1801 #endif
1802     } while (!done);
1803 }
1804 
1805 /*
1806  * Optimize the filter code in its dag representation.
1807  */
1808 void bpf_optimize(rootp)
1809      struct block **rootp;
1810 {
1811     struct block *root;
1812 
1813     root = *rootp;
1814 
1815     opt_init(root);
1816     opt_loop(root, 0);
1817     opt_loop(root, 1);
1818     intern_blocks(root);
1819 #ifdef BDEBUG
1820     if (dflag > 1)
1821     {
1822         printf("after intern_blocks()\n");
1823         opt_dump(root);
1824     }
1825 #endif
1826     opt_root(rootp);
1827 #ifdef BDEBUG
1828     if (dflag > 1)
1829     {
1830         printf("after opt_root()\n");
1831         opt_dump(root);
1832     }
1833 #endif
1834     opt_cleanup();
1835 }
1836 
1837 static void make_marks(p)
1838      struct block *p;
1839 {
1840     if (!isMarked(p))
1841     {
1842         Mark(p);
1843         if (BPF_CLASS(p->s.code) != BPF_RET)
1844         {
1845             make_marks(JT(p));
1846             make_marks(JF(p));
1847         }
1848     }
1849 }
1850 
1851 /*
1852  * Mark code array such that isMarked(i) is true
1853  * only for nodes that are alive.
1854  */
1855 static void mark_code(p)
1856      struct block *p;
1857 {
1858     cur_mark += 1;
1859     make_marks(p);
1860 }
1861 
1862 /*
1863  * True iff the two stmt lists load the same value from the packet into
1864  * the accumulator.
1865  */
1866 static int eq_slist(x, y)
1867      struct slist *x, *y;
1868 {
1869     while (1)
1870     {
1871         while (x && x->s.code == NOP)
1872             x = x->next;
1873         while (y && y->s.code == NOP)
1874             y = y->next;
1875         if (x == 0)
1876             return y == 0;
1877         if (y == 0)
1878             return x == 0;
1879         if (x->s.code != y->s.code || x->s.k != y->s.k)
1880             return 0;
1881         x = x->next;
1882         y = y->next;
1883     }
1884 }
1885 
1886 static inline int eq_blk(b0, b1)
1887      struct block *b0, *b1;
1888 {
1889     if (b0->s.code == b1->s.code &&
1890         b0->s.k == b1->s.k && b0->et.succ == b1->et.succ && b0->ef.succ == b1->ef.succ)
1891         return eq_slist(b0->stmts, b1->stmts);
1892     return 0;
1893 }
1894 
1895 static void intern_blocks(root)
1896      struct block *root;
1897 {
1898     struct block *p;
1899     int i, j;
1900     int done1;                  /* don't shadow global */
1901   top:
1902     done1 = 1;
1903     for (i = 0; i < n_blocks; ++i)
1904         blocks[i]->link = 0;
1905 
1906     mark_code(root);
1907 
1908     for (i = n_blocks - 1; --i >= 0;)
1909     {
1910         if (!isMarked(blocks[i]))
1911             continue;
1912         for (j = i + 1; j < n_blocks; ++j)
1913         {
1914             if (!isMarked(blocks[j]))
1915                 continue;
1916             if (eq_blk(blocks[i], blocks[j]))
1917             {
1918                 blocks[i]->link = blocks[j]->link ? blocks[j]->link : blocks[j];
1919                 break;
1920             }
1921         }
1922     }
1923     for (i = 0; i < n_blocks; ++i)
1924     {
1925         p = blocks[i];
1926         if (JT(p) == 0)
1927             continue;
1928         if (JT(p)->link)
1929         {
1930             done1 = 0;
1931             JT(p) = JT(p)->link;
1932         }
1933         if (JF(p)->link)
1934         {
1935             done1 = 0;
1936             JF(p) = JF(p)->link;
1937         }
1938     }
1939     if (!done1)
1940         goto top;
1941 }
1942 
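/*
 * Free the per-compile buffers allocated by opt_init().
 */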
1943 static void opt_cleanup()
1944 {
1945     free((void *) vnode_base);
1946     free((void *) vmap);
1947     free((void *) edges);
1948     free((void *) space);
1949     free((void *) levels);
1950     free((void *) blocks);
1951 }
1952 
1953 /*
1954  * Return the number of stmts in 's'.
1955  */
1956 static int slength(s)
1957      struct slist *s;
1958 {
1959     int n = 0;
1960 
1961     for (; s; s = s->next)
1962         if (s->s.code != NOP)
1963             ++n;
1964     return n;
1965 }
1966 
1967 /*
1968  * Return the number of nodes reachable by 'p'.
1969  * All nodes should be initially unmarked.
1970  */
1971 static int count_blocks(p)
1972      struct block *p;
1973 {
1974     if (p == 0 || isMarked(p))
1975         return 0;
1976     Mark(p);
1977     return count_blocks(JT(p)) + count_blocks(JF(p)) + 1;
1978 }
1979 
1980 /*
1981  * Do a depth first search on the flow graph, numbering the
 * basic blocks and entering them into the 'blocks' array.
1983  */
1984 static void number_blks_r(p)
1985      struct block *p;
1986 {
1987     int n;
1988 
1989     if (p == 0 || isMarked(p))
1990         return;
1991 
1992     Mark(p);
1993     n = n_blocks++;
1994     p->id = n;
1995     blocks[n] = p;
1996 
1997     number_blks_r(JT(p));
1998     number_blks_r(JF(p));
1999 }
2000 
2001 /*
2002  * Return the number of stmts in the flowgraph reachable by 'p'.
2003  * The nodes should be unmarked before calling.
2004  *
2005  * Note that "stmts" means "instructions", and that this includes
2006  *
2007  *	side-effect statements in 'p' (slength(p->stmts));
2008  *
2009  *	statements in the true branch from 'p' (count_stmts(JT(p)));
2010  *
2011  *	statements in the false branch from 'p' (count_stmts(JF(p)));
2012  *
2013  *	the conditional jump itself (1);
2014  *
2015  *	an extra long jump if the true branch requires it (p->longjt);
2016  *
2017  *	an extra long jump if the false branch requires it (p->longjf).
2018  */
2019 static int count_stmts(p)
2020      struct block *p;
2021 {
2022     int n;
2023 
2024     if (p == 0 || isMarked(p))
2025         return 0;
2026     Mark(p);
2027     n = count_stmts(JT(p)) + count_stmts(JF(p));
2028     return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2029 }
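
/*
 * For example (hypothetical numbers): a block with three non-NOP
 * statements, longjt == 1 and longjf == 0 contributes
 * 3 + 1 + 1 + 0 = 5 instructions of its own, plus whatever its two
 * successors contribute on their first (unmarked) visit.
 */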
2030 
2031 /*
2032  * Allocate memory.  All allocation is done before optimization
2033  * is begun.  A linear bound on the size of all data structures is computed
2034  * from the total number of blocks and/or statements.
2035  */
2036 static void opt_init(root)
2037      struct block *root;
2038 {
2039     bpf_u_int32 *p;
2040     int i, n, max_stmts;
2041 
2042     /*
2043      * First, count the blocks, so we can malloc an array to map
2044      * block number to block.  Then, put the blocks into the array.
2045      */
2046     unMarkAll();
2047     n = count_blocks(root);
2048     blocks = (struct block **) calloc(n, sizeof(*blocks));
2049     if (blocks == NULL)
2050         bpf_error("malloc");
2051     unMarkAll();
2052     n_blocks = 0;
2053     number_blks_r(root);
2054 
2055     n_edges = 2 * n_blocks;
2056     edges = (struct edge **) calloc(n_edges, sizeof(*edges));
2057     if (edges == NULL)
2058         bpf_error("malloc");
2059 
2060     /*
2061      * The number of levels is bounded by the number of nodes.
2062      */
2063     levels = (struct block **) calloc(n_blocks, sizeof(*levels));
2064     if (levels == NULL)
2065         bpf_error("malloc");
2066 
2067     edgewords = n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2068     nodewords = n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2069 
    /*
     * Carve all of the bit-vector sets out of a single arena: a
     * dominator set and a closure set for each block (nodewords
     * words apiece), plus an edge-dominator set for each of the
     * n_edges edges (edgewords words apiece).
     */
2071     space = (bpf_u_int32 *) malloc(2 * n_blocks * nodewords * sizeof(*space)
2072                                    + n_edges * edgewords * sizeof(*space));
2073     if (space == NULL)
2074         bpf_error("malloc");
2075     p = space;
2076     all_dom_sets = p;
2077     for (i = 0; i < n; ++i)
2078     {
2079         blocks[i]->dom = p;
2080         p += nodewords;
2081     }
2082     all_closure_sets = p;
2083     for (i = 0; i < n; ++i)
2084     {
2085         blocks[i]->closure = p;
2086         p += nodewords;
2087     }
2088     all_edge_sets = p;
2089     for (i = 0; i < n; ++i)
2090     {
2091         register struct block *b = blocks[i];
2092 
2093         b->et.edom = p;
2094         p += edgewords;
2095         b->ef.edom = p;
2096         p += edgewords;
2097         b->et.id = i;
2098         edges[i] = &b->et;
2099         b->ef.id = n_blocks + i;
2100         edges[n_blocks + i] = &b->ef;
2101         b->et.pred = b;
2102         b->ef.pred = b;
2103     }
2104     max_stmts = 0;
2105     for (i = 0; i < n; ++i)
2106         max_stmts += slength(blocks[i]->stmts) + 1;
2107     /*
2108      * We allocate at most 3 value numbers per statement,
2109      * so this is an upper bound on the number of valnodes
2110      * we'll need.
2111      */
2112     maxval = 3 * max_stmts;
2113     vmap = (struct vmapinfo *) calloc(maxval, sizeof(*vmap));
2114     vnode_base = (struct valnode *) calloc(maxval, sizeof(*vnode_base));
2115     if (vmap == NULL || vnode_base == NULL)
2116         bpf_error("malloc");
2117 }
2118 
2119 /*
 * Some pointers used to convert the basic block form of the code
2121  * into the array form that BPF requires.  'fstart' will point to
2122  * the malloc'd array while 'ftail' is used during the recursive traversal.
2123  */
2124 static __thread struct bpf_insn *fstart;
2125 static __thread struct bpf_insn *ftail;
2126 
2127 #ifdef BDEBUG
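/*
 * Map from instruction index in the generated array to the id (plus
 * one) of the block that produced it; filled in by convert_code_r()
 * and cleared by opt_dump().
 */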
2128 int bids[1000];
2129 #endif
2130 
2131 /*
2132  * Returns true if successful.  Returns false if a branch has
2133  * an offset that is too large.  If so, we have marked that
2134  * branch so that on a subsequent iteration, it will be treated
2135  * properly.
2136  */
2137 static int convert_code_r(p)
2138      struct block *p;
2139 {
2140     struct bpf_insn *dst;
2141     struct slist *src;
2142     int slen;
2143     u_int off;
2144     int extrajmps;              /* number of extra jumps inserted */
2145     struct slist **offset = NULL;
2146 
2147     if (p == 0 || isMarked(p))
2148         return (1);
2149     Mark(p);
2150 
2151     if (convert_code_r(JF(p)) == 0)
2152         return (0);
2153     if (convert_code_r(JT(p)) == 0)
2154         return (0);
2155 
2156     slen = slength(p->stmts);
2157     dst = ftail -= (slen + 1 + p->longjt + p->longjf);
2158     /* inflate length by any extra jumps */
2159 
2160     p->offset = dst - fstart;
2161 
2162     /* generate offset[] for convenience  */
2163     if (slen)
2164     {
2165         offset = (struct slist **) calloc(slen, sizeof(struct slist *));
2166         if (!offset)
2167         {
2168             bpf_error("not enough core");
            /*NOTREACHED*/
        }
2170     }
2171     src = p->stmts;
2172     for (off = 0; off < slen && src; off++)
2173     {
2174 #if YYDEBUG
2175         printf("off=%d src=%x\n", off, src);
2176 #endif
2177         offset[off] = src;
2178         src = src->next;
2179     }
2180 
2181     off = 0;
2182     for (src = p->stmts; src; src = src->next)
2183     {
2184         if (src->s.code == NOP)
2185             continue;
2186         dst->code = (u_short) src->s.code;
2187         dst->k = src->s.k;
2188 
2189         /* fill block-local relative jump */
2190         if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP | BPF_JA))
2191         {
2192 #if YYDEBUG
2193             if (src->s.jt || src->s.jf)
2194             {
2195                 bpf_error("illegal jmp destination");
                /*NOTREACHED*/
            }
2197 #endif
2198             goto filled;
2199         }
2200         if (off == slen - 2)    /*??? */
2201             goto filled;
2202 
2203         {
2204             int i;
2205             int jt, jf;
2206             const char *ljerr = "%s for block-local relative jump: off=%d";
2207 
2208 #if YYDEBUG
2209             printf("code=%x off=%d %x %x\n", src->s.code, off, src->s.jt, src->s.jf);
2210 #endif
2211 
2212             if (!src->s.jt || !src->s.jf)
2213             {
2214                 bpf_error(ljerr, "no jmp destination", off);
                /*NOTREACHED*/
            }
2216 
2217             jt = jf = 0;
2218             for (i = 0; i < slen; i++)
2219             {
2220                 if (offset[i] == src->s.jt)
2221                 {
2222                     if (jt)
2223                     {
2224                         bpf_error(ljerr, "multiple matches", off);
                        /*NOTREACHED*/
                    }
2226 
2227                     dst->jt = i - off - 1;
2228                     jt++;
2229                 }
2230                 if (offset[i] == src->s.jf)
2231                 {
2232                     if (jf)
2233                     {
2234                         bpf_error(ljerr, "multiple matches", off);
                        /*NOTREACHED*/
                    }
2236                     dst->jf = i - off - 1;
2237                     jf++;
2238                 }
2239             }
2240             if (!jt || !jf)
2241             {
2242                 bpf_error(ljerr, "no destination found", off);
                /*NOTREACHED*/
            }
2244         }
2245       filled:
2246         ++dst;
2247         ++off;
2248     }
2249     if (offset)
2250         free(offset);
2251 
2252 #ifdef BDEBUG
2253     bids[dst - fstart] = p->id + 1;
2254 #endif
2255     dst->code = (u_short) p->s.code;
2256     dst->k = p->s.k;
2257     if (JT(p))
2258     {
2259         extrajmps = 0;
2260         off = JT(p)->offset - (p->offset + slen) - 1;
2261         if (off >= 256)
2262         {
2263             /* offset too large for branch, must add a jump */
2264             if (p->longjt == 0)
2265             {
2266                 /* mark this instruction and retry */
2267                 p->longjt++;
2268                 return (0);
2269             }
2270             /* branch if T to following jump */
2271             dst->jt = extrajmps;
2272             extrajmps++;
2273             dst[extrajmps].code = BPF_JMP | BPF_JA;
2274             dst[extrajmps].k = off - extrajmps;
2275         }
2276         else
2277             dst->jt = off;
2278         off = JF(p)->offset - (p->offset + slen) - 1;
2279         if (off >= 256)
2280         {
2281             /* offset too large for branch, must add a jump */
2282             if (p->longjf == 0)
2283             {
2284                 /* mark this instruction and retry */
2285                 p->longjf++;
2286                 return (0);
2287             }
2288             /* branch if F to following jump */
2289             /* if two jumps are inserted, F goes to second one */
2290             dst->jf = extrajmps;
2291             extrajmps++;
2292             dst[extrajmps].code = BPF_JMP | BPF_JA;
2293             dst[extrajmps].k = off - extrajmps;
2294         }
2295         else
2296             dst->jf = off;
2297     }
2298     return (1);
2299 }
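
/*
 * Illustrative sketch of the long-jump expansion above (hypothetical
 * offsets): a conditional jump's 8-bit jt/jf fields cannot hold an
 * offset of 256 or more, so the branch is pointed at an unconditional
 * jump emitted right after it, which carries the real displacement:
 *
 *      jeq #k, jt=0, jf=...            jt -> the very next insn
 *      ja  +300
 *
 * On the pass where the overflow is first discovered, longjt (or
 * longjf) is bumped and convert_code_r() returns 0, so that
 * icode_to_fcode() re-runs the conversion with room reserved for the
 * extra jump.
 */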
2300 
2301 
2302 /*
2303  * Convert flowgraph intermediate representation to the
2304  * BPF array representation.  Set *lenp to the number of instructions.
2305  *
2306  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2307  * not* do free(fp) before returning fp; doing so would make no sense,
2308  * as the BPF array pointed to by the return value of icode_to_fcode()
2309  * must be valid - it's being returned for use in a bpf_program structure.
2310  *
2311  * If it appears that icode_to_fcode() is leaking, the problem is that
2312  * the program using pcap_compile() is failing to free the memory in
2313  * the BPF program when it's done - the leak is in the program, not in
2314  * the routine that happens to be allocating the memory.  (By analogy, if
2315  * a program calls fopen() without ever calling fclose() on the FILE *,
2316  * it will leak the FILE structure; the leak is not in fopen(), it's in
2317  * the program.)  Change the program to use pcap_freecode() when it's
2318  * done with the filter program.  See the pcap man page.
2319  */
2320 struct bpf_insn *icode_to_fcode(root, lenp)
2321      struct block *root;
2322      int *lenp;
2323 {
2324     int n;
2325     struct bpf_insn *fp;
2326 
2327     /*
2328      * Loop doing convert_code_r() until no branches remain
2329      * with too-large offsets.
2330      */
2331     while (1)
2332     {
2333         unMarkAll();
2334         n = *lenp = count_stmts(root);
2335 
2336         fp = (struct bpf_insn *) malloc(sizeof(*fp) * n);
2337         if (fp == NULL)
2338             bpf_error("malloc");
2339         memset((char *) fp, 0, sizeof(*fp) * n);
2340         fstart = fp;
2341         ftail = fp + n;
2342 
2343         unMarkAll();
2344         if (convert_code_r(root))
2345             break;
2346         free(fp);
2347     }
2348 
2349     return fp;
2350 }
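
/*
 * Illustrative use (hypothetical caller, not part of this module):
 * the returned array is typically stored in a bpf_program, and it is
 * the caller's responsibility to release it when done, e.g.
 *
 *      struct bpf_program prog;
 *      int n;
 *
 *      prog.bf_insns = icode_to_fcode(root, &n);
 *      prog.bf_len = n;
 *      ...
 *      free(prog.bf_insns);            or pcap_freecode(&prog)
 *
 * as the comment above icode_to_fcode() explains.
 */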
2351 
2352 #ifdef BDEBUG
2353 static void opt_dump(root)
2354      struct block *root;
2355 {
2356     struct bpf_program f;
2357 
2358     memset(bids, 0, sizeof bids);
2359     f.bf_insns = icode_to_fcode(root, &f.bf_len);
2360     bpf_dump(&f, 1);
2361     putchar('\n');
2362     free((char *) f.bf_insns);
2363 }
2364 #endif
2365