1 //! Support for egraphs represented in the DataFlowGraph.
2 
3 use crate::FxHashSet;
4 use crate::alias_analysis::{AliasAnalysis, LastStores};
5 use crate::ctxhash::{CtxEq, CtxHash, NullCtx};
6 use crate::cursor::{Cursor, CursorPosition, FuncCursor};
7 use crate::dominator_tree::DominatorTree;
8 use crate::egraph::elaborate::Elaborator;
9 use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
10 use crate::ir::{
11     Block, DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
12 };
13 use crate::loop_analysis::LoopAnalysis;
14 use crate::opts::IsleContext;
15 use crate::opts::generated_code::SkeletonInstSimplification;
16 use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
17 use crate::take_and_replace::TakeAndReplace;
18 use crate::trace;
19 use alloc::vec::Vec;
20 use core::cmp::Ordering;
21 use core::hash::Hasher;
22 use cranelift_control::ControlPlane;
23 use cranelift_entity::SecondaryMap;
24 use cranelift_entity::packed_option::ReservedValue;
25 use smallvec::SmallVec;
26 
27 mod cost;
28 mod elaborate;
29 
30 /// Pass over a Function that does the whole aegraph thing.
31 ///
32 /// - Removes non-skeleton nodes from the Layout.
33 /// - Performs a GVN-and-rule-application pass over all Values
34 ///   reachable from the skeleton, potentially creating new Union
35 ///   nodes (i.e., an aegraph) so that some values have multiple
36 ///   representations.
37 /// - Does "extraction" on the aegraph: selects the best value out of
38 ///   the tree-of-Union nodes for each used value.
39 /// - Does "scoped elaboration" on the aegraph: chooses one or more
40 ///   locations for pure nodes to become instructions again in the
41 ///   layout, as forced by the skeleton.
42 ///
43 /// At the beginning and end of this pass, the CLIF should be in a
44 /// state that passes the verifier and, additionally, has no Union
45 /// nodes. During the pass, Union nodes may exist, and instructions in
46 /// the layout may refer to results of instructions that are not
47 /// placed in the layout.
48 pub struct EgraphPass<'a> {
49     /// The function we're operating on.
50     func: &'a mut Function,
51     /// Dominator tree for the CFG, used to visit blocks in pre-order
52     /// so we see value definitions before their uses, and also used for
53     /// O(1) dominance checks.
54     domtree: &'a DominatorTree,
55     /// Alias analysis, used during optimization.
56     alias_analysis: &'a mut AliasAnalysis<'a>,
57     /// Loop analysis results, used for built-in LICM during
58     /// elaboration.
59     loop_analysis: &'a LoopAnalysis,
60     /// Chaos-mode control-plane so we can test that we still get
61     /// correct results when our heuristics make bad decisions.
62     ctrl_plane: &'a mut ControlPlane,
63     /// Which Values do we want to rematerialize in each block where
64     /// they're used?
65     remat_values: FxHashSet<Value>,
66     /// Stats collected while we run this pass.
67     pub(crate) stats: Stats,
68 }
69 
70 /// The maximum number of rewrites we will take from a single call into ISLE.
71 const MATCHES_LIMIT: usize = 5;
72 
73 /// The maximum number of enodes in any given eclass.
74 const ECLASS_ENODE_LIMIT: usize = 5;
75 
76 /// Context passed through node insertion and optimization.
77 pub(crate) struct OptimizeCtx<'opt, 'analysis>
78 where
79     'analysis: 'opt,
80 {
81     // Borrowed from EgraphPass:
82     pub(crate) func: &'opt mut Function,
83     pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
84     available_block: &'opt mut SecondaryMap<Value, Block>,
85     eclass_size: &'opt mut SecondaryMap<Value, u8>,
86     pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>,
87     pub(crate) gvn_map_blocks: &'opt Vec<Block>,
88     pub(crate) remat_values: &'opt mut FxHashSet<Value>,
89     pub(crate) stats: &'opt mut Stats,
90     domtree: &'opt DominatorTree,
91     pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
92     pub(crate) alias_analysis_state: &'opt mut LastStores,
93     ctrl_plane: &'opt mut ControlPlane,
94     // Held locally during optimization of one node (recursively):
95     pub(crate) rewrite_depth: usize,
96     pub(crate) subsume_values: FxHashSet<Value>,
97     optimized_values: SmallVec<[Value; MATCHES_LIMIT]>,
98     optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>,
99 }
100 
101 /// For passing to `insert_pure_enode`. Sometimes the enode already
102 /// exists as an Inst (from the original CLIF), and sometimes we're in
103 /// the middle of creating it and want to avoid inserting it if
104 /// possible until we know we need it.
105 pub(crate) enum NewOrExistingInst {
106     New(InstructionData, Type),
107     Existing(Inst),
108 }
109 
110 impl NewOrExistingInst {
get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData)111     fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
112         match self {
113             NewOrExistingInst::New(data, ty) => (*ty, *data),
114             NewOrExistingInst::Existing(inst) => {
115                 let ty = dfg.ctrl_typevar(*inst);
116                 (ty, dfg.insts[*inst])
117             }
118         }
119     }
120 }
121 
122 impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
123 where
124     'analysis: 'opt,
125 {
126     /// Optimization of a single instruction.
127     ///
128     /// This does a few things:
129     /// - Looks up the instruction in the GVN deduplication map. If we
130     ///   already have the same instruction somewhere else, with the
131     ///   same args, then we can alias the original instruction's
132     ///   results and omit this instruction entirely.
133     /// - If the instruction is "new" (not deduplicated), then apply
134     ///   optimization rules:
135     ///   - All of the mid-end rules written in ISLE.
136     ///   - Store-to-load forwarding.
137     /// - Update the value-to-opt-value map, and update the eclass
138     ///   union-find, if we rewrote the value to different form(s).
insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value139     pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
140         // Create the external context for looking up and updating the
141         // GVN map. This is necessary so that instructions themselves
142         // do not have to carry all the references or data for a full
143         // `Eq` or `Hash` impl.
144         let gvn_context = GVNContext {
145             value_lists: &self.func.dfg.value_lists,
146         };
147 
148         self.stats.pure_inst += 1;
149         if let NewOrExistingInst::New(..) = inst {
150             self.stats.new_inst += 1;
151         }
152 
153         // Does this instruction already exist? If so, add entries to
154         // the value-map to rewrite uses of its results to the results
155         // of the original (existing) instruction. If not, optimize
156         // the new instruction.
157         if let Some(&Some(orig_result)) = self
158             .gvn_map
159             .get(&gvn_context, &inst.get_inst_key(&self.func.dfg))
160         {
161             self.stats.pure_inst_deduped += 1;
162             if let NewOrExistingInst::Existing(inst) = inst {
163                 debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
164                 let result = self.func.dfg.first_result(inst);
165                 self.value_to_opt_value[result] = orig_result;
166                 self.available_block[result] = self.available_block[orig_result];
167             }
168             orig_result
169         } else {
170             // Now actually insert the InstructionData and attach
171             // result value (exactly one).
172             let (inst, result, ty) = match inst {
173                 NewOrExistingInst::New(data, typevar) => {
174                     self.stats.pure_inst_insert_new += 1;
175                     let inst = self.func.dfg.make_inst(data);
176                     // TODO: reuse return value?
177                     self.func.dfg.make_inst_results(inst, typevar);
178                     let result = self.func.dfg.first_result(inst);
179                     // New inst. We need to do the analysis of its result.
180                     (inst, result, typevar)
181                 }
182                 NewOrExistingInst::Existing(inst) => {
183                     self.stats.pure_inst_insert_orig += 1;
184                     let result = self.func.dfg.first_result(inst);
185                     let ty = self.func.dfg.ctrl_typevar(inst);
186                     (inst, result, ty)
187                 }
188             };
189 
190             self.available_block[result] = self.get_available_block(inst);
191             let opt_value = self.optimize_pure_enode(inst);
192             log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}");
193 
194             let gvn_context = GVNContext {
195                 value_lists: &self.func.dfg.value_lists,
196             };
197             // Insert at level implied by args. This enables merging
198             // in LICM cases like:
199             //
200             // while (...) {
201             //   if (...) {
202             //     let x = loop_invariant_expr;
203             //   }
204             //   if (...) {
205             //     let x = loop_invariant_expr;
206             //   }
207             // }
208             //
209             // where the two instances of the expression otherwise
210             // wouldn't merge because each would be in a separate
211             // subscope of the scoped hashmap during traversal.
212             log::trace!(
213                 "value {} is available at {}",
214                 opt_value,
215                 self.available_block[opt_value]
216             );
217             let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]);
218             self.gvn_map.insert_with_depth(
219                 &gvn_context,
220                 (ty, self.func.dfg.insts[inst]),
221                 Some(opt_value),
222                 depth,
223             );
224             self.value_to_opt_value[result] = opt_value;
225             opt_value
226         }
227     }
228 
229     /// Find the block where a pure instruction first becomes available,
230     /// defined as the block that is closest to the root where all of
231     /// its arguments are available. In the unusual case where a pure
232     /// instruction has no arguments (e.g. get_return_address), we can
233     /// place it anywhere, so it is available in the entry block.
234     ///
235     /// This function does not compute available blocks recursively.
236     /// All of the instruction's arguments must have had their available
237     /// blocks assigned already.
get_available_block(&self, inst: Inst) -> Block238     fn get_available_block(&self, inst: Inst) -> Block {
239         // Side-effecting instructions have different rules for where
240         // they become available, so this function does not apply.
241         debug_assert!(is_pure_for_egraph(self.func, inst));
242 
243         // Note that the def-point of all arguments to an instruction
244         // in SSA lie on a line of direct ancestors in the domtree, and
245         // so do their available-blocks. This means that for any pair of
246         // arguments, their available blocks are either the same or one
247         // strictly dominates the other. We just need to find any argument
248         // whose available block is deepest in the domtree.
249         self.func.dfg.insts[inst]
250             .arguments(&self.func.dfg.value_lists)
251             .iter()
252             .map(|&v| {
253                 let block = self.available_block[v];
254                 debug_assert!(!block.is_reserved_value());
255                 block
256             })
257             .max_by(|&x, &y| {
258                 if self.domtree.block_dominates(x, y) {
259                     Ordering::Less
260                 } else {
261                     debug_assert!(self.domtree.block_dominates(y, x));
262                     Ordering::Greater
263                 }
264             })
265             .unwrap_or(self.func.layout.entry_block().unwrap())
266     }
267 
depth_of_block_in_gvn_map(&self, block: Block) -> usize268     fn depth_of_block_in_gvn_map(&self, block: Block) -> usize {
269         log::trace!(
270             "finding depth of available block {} in domtree stack: {:?}",
271             block,
272             self.gvn_map_blocks
273         );
274         self.gvn_map_blocks
275             .iter()
276             .enumerate()
277             .rev()
278             .find(|&(_, b)| *b == block)
279             .unwrap()
280             .0
281     }
282 
283     /// Optimizes an enode by applying any matching mid-end rewrite
284     /// rules (or store-to-load forwarding, which is a special case),
285     /// unioning together all possible optimized (or rewritten) forms
286     /// of this expression into an eclass and returning the `Value`
287     /// that represents that eclass.
optimize_pure_enode(&mut self, inst: Inst) -> Value288     fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
289         // A pure node always has exactly one result.
290         let orig_value = self.func.dfg.first_result(inst);
291 
292         let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values);
293         let (ctx, optimized_values) = guard.get();
294 
295         // Limit rewrite depth. When we apply optimization rules, they
296         // may create new nodes (values) and those are, recursively,
297         // optimized eagerly as soon as they are created. So we may
298         // have more than one ISLE invocation on the stack. (This is
299         // necessary so that as the toplevel builds the
300         // right-hand-side expression bottom-up, it uses the "latest"
301         // optimized values for all the constituent parts.) To avoid
302         // infinite or problematic recursion, we bound the rewrite
303         // depth to a small constant here.
304         const REWRITE_LIMIT: usize = 5;
305         if ctx.rewrite_depth > REWRITE_LIMIT {
306             ctx.stats.rewrite_depth_limit += 1;
307             return orig_value;
308         }
309         ctx.rewrite_depth += 1;
310         trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth);
311 
312         // Invoke the ISLE toplevel constructor, getting all new
313         // values produced as equivalents to this value.
314         trace!("Calling into ISLE with original value {}", orig_value);
315         ctx.stats.rewrite_rule_invoked += 1;
316         debug_assert!(optimized_values.is_empty());
317         crate::opts::generated_code::constructor_simplify(
318             &mut IsleContext { ctx },
319             orig_value,
320             optimized_values,
321         );
322 
323         ctx.stats.rewrite_rule_results += optimized_values.len() as u64;
324 
325         // It's not supposed to matter what order `simplify` returns values in.
326         ctx.ctrl_plane.shuffle(optimized_values);
327 
328         let num_matches = optimized_values.len();
329         if num_matches > MATCHES_LIMIT {
330             trace!(
331                 "Reached maximum matches limit; too many optimized values \
332                  ({num_matches} > {MATCHES_LIMIT}); ignoring rest.",
333             );
334             optimized_values.truncate(MATCHES_LIMIT);
335         }
336 
337         // Sort and deduplicate optimized values, in case multiple
338         // rules produced the same simplification.
339         optimized_values.sort_unstable();
340         optimized_values.dedup();
341 
342         trace!("  -> returned from ISLE: {orig_value} -> {optimized_values:?}");
343 
344         // Construct a union-node tree representing the new eclass
345         // that results from rewriting. If any returned value was
346         // marked "subsume", take only that value. Otherwise,
347         // sequentially build the chain over the original value and
348         // all returned values.
349         let result_value = if let Some(&subsuming_value) = optimized_values
350             .iter()
351             .find(|&value| ctx.subsume_values.contains(value))
352         {
353             optimized_values.clear();
354             ctx.stats.pure_inst_subsume += 1;
355             subsuming_value
356         } else {
357             let mut union_value = orig_value;
358             let mut eclass_size = ctx.eclass_size[orig_value] + 1;
359             for optimized_value in optimized_values.drain(..) {
360                 trace!(
361                     "Returned from ISLE for {}, got {:?}",
362                     orig_value, optimized_value
363                 );
364                 if optimized_value == orig_value {
365                     trace!(" -> same as orig value; skipping");
366                     ctx.stats.pure_inst_rewrite_to_self += 1;
367                     continue;
368                 }
369                 let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1;
370                 if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT {
371                     trace!(" -> reached eclass size limit");
372                     ctx.stats.eclass_size_limit += 1;
373                     break;
374                 }
375                 let old_union_value = union_value;
376                 union_value = ctx.func.dfg.union(old_union_value, optimized_value);
377                 eclass_size += rhs_eclass_size;
378                 ctx.eclass_size[union_value] = eclass_size - 1;
379                 ctx.stats.union += 1;
380                 trace!(" -> union: now {}", union_value);
381                 ctx.available_block[union_value] =
382                     ctx.merge_availability(old_union_value, optimized_value);
383             }
384             union_value
385         };
386 
387         ctx.rewrite_depth -= 1;
388         trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth);
389         if ctx.rewrite_depth == 0 {
390             ctx.subsume_values.clear();
391         }
392 
393         debug_assert!(ctx.optimized_values.is_empty());
394         result_value
395     }
396 
merge_availability(&self, a: Value, b: Value) -> Block397     fn merge_availability(&self, a: Value, b: Value) -> Block {
398         let a = self.available_block[a];
399         let b = self.available_block[b];
400         if self.domtree.block_dominates(a, b) {
401             a
402         } else {
403             b
404         }
405     }
406 
407     /// Optimize a "skeleton" instruction.
408     ///
409     /// Returns an optional command of how to continue processing the optimized
410     /// instruction (e.g. removing it or replacing it with a new instruction).
optimize_skeleton_inst( &mut self, inst: Inst, block: Block, ) -> Option<SkeletonInstSimplification>411     fn optimize_skeleton_inst(
412         &mut self,
413         inst: Inst,
414         block: Block,
415     ) -> Option<SkeletonInstSimplification> {
416         self.stats.skeleton_inst += 1;
417 
418         // If we have a rewrite rule for this instruction, do that first, so
419         // that GVN and alias analysis only see simplified skeleton
420         // instructions.
421         if let Some(cmd) = self.simplify_skeleton_inst(inst) {
422             self.stats.skeleton_inst_simplified += 1;
423             return Some(cmd);
424         }
425 
426         // First, can we try to deduplicate? We need to keep some copy
427         // of the instruction around because it's side-effecting, but
428         // we may be able to reuse an earlier instance of it.
429         if is_mergeable_for_egraph(self.func, inst) {
430             let result = self.func.dfg.inst_results(inst).get(0).copied();
431             trace!(" -> mergeable side-effecting op {}", inst);
432 
433             // Does this instruction already exist? If so, add entries to
434             // the value-map to rewrite uses of its results to the results
435             // of the original (existing) instruction. If not, optimize
436             // the new instruction.
437             //
438             // Note that the GVN map is scoped, which is important
439             // here: because effectful ops are not removed from the
440             // skeleton (`Layout`), we need to be mindful of whether
441             // our current position is dominated by an instance of the
442             // instruction. (See #5796 for details.)
443             let ty = self.func.dfg.ctrl_typevar(inst);
444             match self
445                 .gvn_map
446                 .entry(&NullCtx, (ty, self.func.dfg.insts[inst]))
447             {
448                 ScopedEntry::Occupied(o) => {
449                     let orig_result = *o.get();
450                     match (result, orig_result) {
451                         (Some(result), Some(orig_result)) => {
452                             // Hit in GVN map -- reuse value.
453                             self.stats.skeleton_inst_gvn += 1;
454                             self.value_to_opt_value[result] = orig_result;
455                             self.available_block[result] = self.available_block[orig_result];
456                             trace!(" -> merges result {} to {}", result, orig_result);
457                         }
458                         (None, None) => {
459                             // Hit in the GVN map, but the instruction doesn't
460                             // produce results, only side effects. Nothing else
461                             // to do here.
462                             self.stats.skeleton_inst_gvn += 1;
463                             trace!(" -> merges with dominating instruction");
464                         }
465                         (_, _) => unreachable!(),
466                     }
467                     Some(SkeletonInstSimplification::Remove)
468                 }
469                 ScopedEntry::Vacant(v) => {
470                     // Otherwise, insert it into the value-map.
471                     if let Some(result) = result {
472                         self.value_to_opt_value[result] = result;
473                         self.available_block[result] = block;
474                     }
475                     v.insert(result);
476                     trace!(" -> inserts as new (no GVN)");
477                     None
478                 }
479             }
480         }
481         // Otherwise, if a load or store, process it with the alias
482         // analysis to see if we can optimize it (rewrite in terms of
483         // an earlier load or stored value).
484         else if let Some(new_result) =
485             self.alias_analysis
486                 .process_inst(self.func, self.alias_analysis_state, inst)
487         {
488             self.stats.alias_analysis_removed += 1;
489             let result = self.func.dfg.first_result(inst);
490             trace!(
491                 " -> inst {} has result {} replaced with {}",
492                 inst, result, new_result
493             );
494             self.value_to_opt_value[result] = new_result;
495             self.available_block[result] = self.available_block[new_result];
496             Some(SkeletonInstSimplification::Remove)
497         }
498         // Otherwise, generic side-effecting op -- always keep it, and
499         // set its results to identity-map to original values.
500         else {
501             // Set all results to identity-map to themselves
502             // in the value-to-opt-value map.
503             for &result in self.func.dfg.inst_results(inst) {
504                 self.value_to_opt_value[result] = result;
505                 self.available_block[result] = block;
506             }
507             None
508         }
509     }
510 
511     /// Find the best simplification of the given skeleton instruction, if any,
512     /// by consulting our `simplify_skeleton` ISLE rules.
simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification>513     fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> {
514         // We cannot currently simplify terminators, or simplify into
515         // terminators. Anything that could change the control-flow graph is off
516         // limits.
517         //
518         // Consider the following CLIF snippet:
519         //
520         //     block0(v0: i64):
521         //         v1 = iconst.i32 0
522         //         trapz v1, user42
523         //         v2 = load.i32 v0
524         //         brif v1, block1, block2
525         //     block1:
526         //         return v2
527         //     block2:
528         //         v3 = iconst.i32 1
529         //         v4 = iadd v2, v3
530         //         return v4
531         //
532         // We would ideally like to perform simplifications like replacing the
533         // `trapz` with an unconditional `trap` and the conditional `brif`
534         // branch with an unconditional `jump`. Note, however, that blocks
535         // `block1` and `block2` are dominated by `block0` and therefore can and
536         // do use values defined in `block0`. This presents challenges:
537         //
538         // * If we replace the `brif` with a `jump`, then we've mutated the
539         //   control-flow graph and removed that domination property. The uses
540         //   of `v2` and `v3` in those blocks become invalid.
541         //
542         // * Even worse, if we turn the `trapz` into a `trap`, we are
543         //   introducing a terminator into the middle of the block, which leaves
544         //   us with two choices to fix up the IR so that there aren't any
545         //   instructions following the terminator in the block:
546         //
547         //   1. We can split the unreachable instructions off into a new
548         //      block. However, there is no control-flow edge from the current
549         //      block to this new block and so, again, the new block isn't
550         //      dominated by the current block, and therefore the can't use
551         //      values defined in this block or any dominating it. The `load`
552         //      instruction uses `v0` but is not dominated by `v0`'s
553         //      definition.
554         //
555         //   2. Alternatively, we can simply delete the trailing instructions,
556         //      since they are unreachable. But then not only are the old
557         //      instructions' uses no longer dominated by their definitions, but
558         //      the definitions do not exist at all anymore!
559         //
560         // Whatever approach we would take, we would invalidate value uses, and
561         // would need to track and fix them up.
562         if self.func.dfg.insts[inst].opcode().is_branch() {
563             return None;
564         }
565 
566         let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts);
567         let (ctx, optimized_insts) = guard.get();
568 
569         crate::opts::generated_code::constructor_simplify_skeleton(
570             &mut IsleContext { ctx },
571             inst,
572             optimized_insts,
573         );
574 
575         let simplifications_len = optimized_insts.len();
576         log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)");
577         if simplifications_len > MATCHES_LIMIT {
578             log::trace!("      too many candidate simplifications; truncating to {MATCHES_LIMIT}");
579             optimized_insts.truncate(MATCHES_LIMIT);
580         }
581 
582         // Find the best simplification, if any, from our candidates.
583         //
584         // Unlike simplifying pure values, we do not add side-effectful
585         // instructions to the egraph, nor do we extract the best version via
586         // dynamic programming and considering the costs of operands. Instead,
587         // we greedily choose the best simplification. This is because there is
588         // an impedance mismatch: the egraph and our pure rewrites are centered
589         // around *values*, but we don't represent side-effects with values, we
590         // represent them implicitly in their *instructions*.
591         //
592         // The initial best choice is "no simplification, just use the original
593         // instruction" which has the original instruction's cost.
594         let mut best = None;
595         let mut best_cost = cost::Cost::of_skeleton_op(
596             ctx.func.dfg.insts[inst].opcode(),
597             ctx.func.dfg.inst_args(inst).len(),
598         );
599         while let Some(simplification) = optimized_insts.pop() {
600             let (new_inst, new_val) = match simplification {
601                 // We can't do better than completely removing the skeleton
602                 // instruction, so short-cicuit the loop and eagerly return the
603                 // `Remove*` simplifications.
604                 SkeletonInstSimplification::Remove => {
605                     log::trace!(" -> simplify_skeleton: remove inst");
606                     debug_assert!(ctx.func.dfg.inst_results(inst).is_empty());
607                     return Some(simplification);
608                 }
609                 SkeletonInstSimplification::RemoveWithVal { val } => {
610                     log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result");
611                     if cfg!(debug_assertions) {
612                         let results = ctx.func.dfg.inst_results(inst);
613                         debug_assert_eq!(results.len(), 1);
614                         debug_assert_eq!(
615                             ctx.func.dfg.value_type(results[0]),
616                             ctx.func.dfg.value_type(val),
617                         );
618                     }
619                     return Some(simplification);
620                 }
621 
622                 // For instruction replacement simplification, we want to check
623                 // that the replacements define the same number and types of
624                 // values as the original instruction, and also determine
625                 // whether they are actually an improvement over (i.e. have
626                 // lower cost than) the original instruction.
627                 SkeletonInstSimplification::Replace { inst } => {
628                     log::trace!(
629                         " -> simplify_skeleton: replace inst with {inst}: {}",
630                         ctx.func.dfg.display_inst(inst)
631                     );
632                     (inst, None)
633                 }
634                 SkeletonInstSimplification::ReplaceWithVal { inst, val } => {
635                     log::trace!(
636                         " -> simplify_skeleton: replace inst with {val} and {inst}: {}",
637                         ctx.func.dfg.display_inst(inst)
638                     );
639                     (inst, Some(val))
640                 }
641             };
642 
643             if cfg!(debug_assertions) {
644                 let opcode = ctx.func.dfg.insts[inst].opcode();
645                 debug_assert!(
646                     !(opcode.is_terminator() || opcode.is_branch()),
647                     "simplifying control-flow instructions and terminators is not yet supported",
648                 );
649 
650                 let old_vals = ctx.func.dfg.inst_results(inst);
651                 let new_vals = if let Some(val) = new_val.as_ref() {
652                     core::slice::from_ref(val)
653                 } else {
654                     ctx.func.dfg.inst_results(new_inst)
655                 };
656                 debug_assert_eq!(
657                     old_vals.len(),
658                     new_vals.len(),
659                     "skeleton simplification should result in the same number of result values",
660                 );
661 
662                 for (old_val, new_val) in old_vals.iter().zip(new_vals) {
663                     let old_ty = ctx.func.dfg.value_type(*old_val);
664                     let new_ty = ctx.func.dfg.value_type(*new_val);
665                     debug_assert_eq!(
666                         old_ty, new_ty,
667                         "skeleton simplification should result in values of the correct type",
668                     );
669                 }
670             }
671 
672             // Our best simplification is the one with the least cost. Update
673             // `best` if necessary.
674             let cost = cost::Cost::of_skeleton_op(
675                 ctx.func.dfg.insts[new_inst].opcode(),
676                 ctx.func.dfg.inst_args(new_inst).len(),
677             );
678             if cost < best_cost {
679                 best = Some(simplification);
680                 best_cost = cost;
681             }
682         }
683 
684         // Return the best simplification!
685         best
686     }
687 }
688 
689 impl<'a> EgraphPass<'a> {
690     /// Create a new EgraphPass.
new( func: &'a mut Function, domtree: &'a DominatorTree, loop_analysis: &'a LoopAnalysis, alias_analysis: &'a mut AliasAnalysis<'a>, ctrl_plane: &'a mut ControlPlane, ) -> Self691     pub fn new(
692         func: &'a mut Function,
693         domtree: &'a DominatorTree,
694         loop_analysis: &'a LoopAnalysis,
695         alias_analysis: &'a mut AliasAnalysis<'a>,
696         ctrl_plane: &'a mut ControlPlane,
697     ) -> Self {
698         Self {
699             func,
700             domtree,
701             loop_analysis,
702             alias_analysis,
703             ctrl_plane,
704             stats: Stats::default(),
705             remat_values: FxHashSet::default(),
706         }
707     }
708 
709     /// Run the process.
run(&mut self)710     pub fn run(&mut self) {
711         self.remove_pure_and_optimize();
712 
713         trace!("egraph built:\n{}\n", self.func.display());
714         if cfg!(feature = "trace-log") {
715             for (value, def) in self.func.dfg.values_and_defs() {
716                 trace!(" -> {} = {:?}", value, def);
717                 match def {
718                     ValueDef::Result(i, 0) => {
719                         trace!("  -> {} = {:?}", i, self.func.dfg.insts[i]);
720                     }
721                     _ => {}
722                 }
723             }
724         }
725 
726         self.elaborate();
727 
728         log::trace!("stats: {:#?}", self.stats);
729     }
730 
731     /// Remove pure nodes from the `Layout` of the function, ensuring
732     /// that only the "side-effect skeleton" remains, and also
733     /// optimize the pure nodes. This is the first step of
734     /// egraph-based processing and turns the pure CFG-based CLIF into
735     /// a CFG skeleton with a sea of (optimized) nodes tying it
736     /// together.
737     ///
738     /// As we walk through the code, we eagerly apply optimization
739     /// rules; at any given point we have a "latest version" of an
740     /// eclass of possible representations for a `Value` in the
741     /// original program, which is itself a `Value` at the root of a
742     /// union-tree. We keep a map from the original values to these
743     /// optimized values. When we encounter any instruction (pure or
744     /// side-effecting skeleton) we rewrite its arguments to capture
745     /// the "latest" optimized forms of these values. (We need to do
746     /// this as part of this pass, and not later using a finished map,
747     /// because the eclass can continue to be updated and we need to
748     /// only refer to its subset that exists at this stage, to
749     /// maintain acyclicity.)
remove_pure_and_optimize(&mut self)750     fn remove_pure_and_optimize(&mut self) {
751         let mut cursor = FuncCursor::new(self.func);
752         let mut value_to_opt_value: SecondaryMap<Value, Value> =
753             SecondaryMap::with_default(Value::reserved_value());
754 
755         // Map from instruction to value for hash-consing of pure ops
756         // into the egraph. This can be a standard (non-scoped)
757         // hashmap because pure ops have no location: they are
758         // "outside of" control flow.
759         //
760         // Note also that we keep the controlling typevar (the `Type`
761         // in the tuple below) because it may disambiguate
762         // instructions that are identical except for type.
763         //
764         // We store both skeleton and non-skeleton instructions in the
765         // GVN map; for skeleton instructions, we only store those
766         // that are idempotent, i.e., still eligible to GVN. Note that
767         // some skeleton instructions are idempotent but do not
768         // produce a value: e.g., traps on a given condition. To allow
769         // for both cases, we store an `Option<Value>` as the value in
770         // this map.
771         let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> =
772             ScopedHashMap::with_capacity(cursor.func.dfg.num_values());
773 
774         // The block in the domtree preorder traversal at each level
775         // of the GVN map.
776         let mut gvn_map_blocks: Vec<Block> = vec![];
777 
778         // To get the best possible merging and canonicalization, we
779         // track where a value is "available" at: this is the
780         // domtree-nearest-ancestor join of all args if the value
781         // itself is pure, otherwise the block where the value is
782         // defined. (And for union nodes, the
783         // domtree-highest-ancestor, i.e., the meet or the dual to the
784         // above join.)
785         let mut available_block: SecondaryMap<Value, Block> =
786             SecondaryMap::with_default(Block::reserved_value());
787 
788         // To avoid blowing up eclasses too much, we track the size of
789         // each eclass reachable by a tree of union nodes from a given
790         // value ID, and we avoid union'ing additional values into an
791         // eclass when it reaches `ECLASS_ENODE_LIMIT`.
792         //
793         // For efficiency, this encodes size minus one: so a value of
794         // zero (which is cheap to bulk-initialize) means a singleton
795         // eclass of size one. This also allows us to avoid explicitly
796         // writing the size for any values that are not union nodes.
797         let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0);
798 
799         // This is an initial guess at the size we'll need, but we add
800         // more values as we build simplified alternative expressions so
801         // this is likely to realloc again later.
802         available_block.resize(cursor.func.dfg.num_values());
803 
804         // In domtree preorder, visit blocks. (TODO: factor out an
805         // iterator from this and elaborator.)
806         let root = cursor.layout().entry_block().unwrap();
807         enum StackEntry {
808             Visit(Block),
809             Pop,
810         }
811         let mut block_stack = vec![StackEntry::Visit(root)];
812         while let Some(entry) = block_stack.pop() {
813             match entry {
814                 StackEntry::Visit(block) => {
815                     // We popped this block; push children
816                     // immediately, then process this block.
817                     block_stack.push(StackEntry::Pop);
818                     block_stack.extend(
819                         self.ctrl_plane
820                             .shuffled(self.domtree.children(block))
821                             .map(StackEntry::Visit),
822                     );
823                     gvn_map.increment_depth();
824                     gvn_map_blocks.push(block);
825 
826                     trace!("Processing block {}", block);
827                     cursor.set_position(CursorPosition::Before(block));
828 
829                     let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
830 
831                     for &param in cursor.func.dfg.block_params(block) {
832                         trace!("creating initial singleton eclass for blockparam {}", param);
833                         value_to_opt_value[param] = param;
834                         available_block[param] = block;
835                     }
836                     while let Some(inst) = cursor.next_inst() {
837                         trace!(
838                             "Processing inst {inst}: {}",
839                             cursor.func.dfg.display_inst(inst),
840                         );
841 
842                         // Rewrite args of *all* instructions using the
843                         // value-to-opt-value map.
844                         cursor.func.dfg.map_inst_values(inst, |arg| {
845                             let new_value = value_to_opt_value[arg];
846                             trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
847                             debug_assert_ne!(new_value, Value::reserved_value());
848                             new_value
849                         });
850 
851                         // Build a context for optimization, with borrows of
852                         // state. We can't invoke a method on `self` because
853                         // we've borrowed `self.func` mutably (as
854                         // `cursor.func`) so we pull apart the pieces instead
855                         // here.
856                         let mut ctx = OptimizeCtx {
857                             func: cursor.func,
858                             value_to_opt_value: &mut value_to_opt_value,
859                             gvn_map: &mut gvn_map,
860                             gvn_map_blocks: &mut gvn_map_blocks,
861                             available_block: &mut available_block,
862                             eclass_size: &mut eclass_size,
863                             rewrite_depth: 0,
864                             subsume_values: FxHashSet::default(),
865                             remat_values: &mut self.remat_values,
866                             stats: &mut self.stats,
867                             domtree: &self.domtree,
868                             alias_analysis: self.alias_analysis,
869                             alias_analysis_state: &mut alias_analysis_state,
870                             ctrl_plane: self.ctrl_plane,
871                             optimized_values: Default::default(),
872                             optimized_insts: Default::default(),
873                         };
874 
875                         if is_pure_for_egraph(ctx.func, inst) {
876                             // Insert into GVN map and optimize any new nodes
877                             // inserted (recursively performing this work for
878                             // any nodes the optimization rules produce).
879                             let inst = NewOrExistingInst::Existing(inst);
880                             ctx.insert_pure_enode(inst);
881                             // We've now rewritten all uses, or will when we
882                             // see them, and the instruction exists as a pure
883                             // enode in the eclass, so we can remove it.
884                             cursor.remove_inst_and_step_back();
885                         } else {
886                             if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) {
887                                 Self::execute_skeleton_inst_simplification(
888                                     cmd,
889                                     &mut cursor,
890                                     &mut value_to_opt_value,
891                                     inst,
892                                 );
893                             }
894                         }
895                     }
896                 }
897                 StackEntry::Pop => {
898                     gvn_map.decrement_depth();
899                     gvn_map_blocks.pop();
900                 }
901             }
902         }
903     }
904 
905     /// Execute a simplification of an instruction in the side-effectful
906     /// skeleton.
execute_skeleton_inst_simplification( simplification: SkeletonInstSimplification, cursor: &mut FuncCursor, value_to_opt_value: &mut SecondaryMap<Value, Value>, old_inst: Inst, )907     fn execute_skeleton_inst_simplification(
908         simplification: SkeletonInstSimplification,
909         cursor: &mut FuncCursor,
910         value_to_opt_value: &mut SecondaryMap<Value, Value>,
911         old_inst: Inst,
912     ) {
913         let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| {
914             cursor.func.dfg.change_to_alias(old_val, new_val);
915             value_to_opt_value[old_val] = new_val;
916         };
917 
918         let (new_inst, new_val) = match simplification {
919             SkeletonInstSimplification::Remove => {
920                 cursor.remove_inst_and_step_back();
921                 return;
922             }
923             SkeletonInstSimplification::RemoveWithVal { val } => {
924                 cursor.remove_inst_and_step_back();
925                 let old_val = cursor.func.dfg.first_result(old_inst);
926                 cursor.func.dfg.detach_inst_results(old_inst);
927                 forward_val(cursor, old_val, val);
928                 return;
929             }
930             SkeletonInstSimplification::Replace { inst } => (inst, None),
931             SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)),
932         };
933 
934         // Replace the old instruction with the new one.
935         cursor.replace_inst(new_inst);
936         debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator());
937 
938         // Redirect the old instruction's result values to our new instruction's
939         // result values.
940         let mut i = 0;
941         let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value {
942             if let Some(val) = new_val {
943                 val
944             } else {
945                 let val = dfg.inst_results(new_inst)[i];
946                 i += 1;
947                 val
948             }
949         };
950         for i in 0..cursor.func.dfg.inst_results(old_inst).len() {
951             let old_val = cursor.func.dfg.inst_results(old_inst)[i];
952             let new_val = next_new_val(&cursor.func.dfg);
953             forward_val(cursor, old_val, new_val);
954         }
955 
956         // Back up so that the next iteration of the outer egraph loop will
957         // process the new instruction.
958         cursor.goto_inst(new_inst);
959         cursor.prev_inst();
960     }
961 
962     /// Scoped elaboration: compute a final ordering of op computation
963     /// for each block and update the given Func body. After this
964     /// runs, the function body is back into the state where every
965     /// Inst with an used result is placed in the layout (possibly
966     /// duplicated, if our code-motion logic decides this is the best
967     /// option).
968     ///
969     /// This works in concert with the domtree. We do a preorder
970     /// traversal of the domtree, tracking a scoped map from Id to
971     /// (new) Value. The map's scopes correspond to levels in the
972     /// domtree.
973     ///
974     /// At each block, we iterate forward over the side-effecting
975     /// eclasses, and recursively generate their arg eclasses, then
976     /// emit the ops themselves.
977     ///
978     /// To use an eclass in a given block, we first look it up in the
979     /// scoped map, and get the Value if already present. If not, we
980     /// need to generate it. We emit the extracted enode for this
981     /// eclass after recursively generating its args. Eclasses are
982     /// thus computed "as late as possible", but then memoized into
983     /// the Id-to-Value map and available to all dominated blocks and
984     /// for the rest of this block. (This subsumes GVN.)
elaborate(&mut self)985     fn elaborate(&mut self) {
986         let mut elaborator = Elaborator::new(
987             self.func,
988             &self.domtree,
989             self.loop_analysis,
990             &self.remat_values,
991             &mut self.stats,
992             self.ctrl_plane,
993         );
994         elaborator.elaborate();
995 
996         self.check_post_egraph();
997     }
998 
999     #[cfg(debug_assertions)]
check_post_egraph(&self)1000     fn check_post_egraph(&self) {
1001         // Verify that no union nodes are reachable from inst args,
1002         // and that all inst args' defining instructions are in the
1003         // layout.
1004         for block in self.func.layout.blocks() {
1005             for inst in self.func.layout.block_insts(block) {
1006                 self.func
1007                     .dfg
1008                     .inst_values(inst)
1009                     .for_each(|arg| match self.func.dfg.value_def(arg) {
1010                         ValueDef::Result(i, _) => {
1011                             debug_assert!(self.func.layout.inst_block(i).is_some());
1012                         }
1013                         ValueDef::Union(..) => {
1014                             panic!("egraph union node {arg} still reachable at {inst}!");
1015                         }
1016                         _ => {}
1017                     })
1018             }
1019         }
1020     }
1021 
1022     #[cfg(not(debug_assertions))]
check_post_egraph(&self)1023     fn check_post_egraph(&self) {}
1024 }
1025 
1026 /// Implementation of external-context equality and hashing on
1027 /// InstructionData. This allows us to deduplicate instructions given
1028 /// some context that lets us see its value lists, so we don't need to
1029 /// store arguments inline in the `InstructionData` (or alongside it in
1030 /// some newly-defined key type) in all cases.
1031 struct GVNContext<'a> {
1032     value_lists: &'a ValueListPool,
1033 }
1034 
1035 impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
ctx_eq( &self, (a_ty, a_inst): &(Type, InstructionData), (b_ty, b_inst): &(Type, InstructionData), ) -> bool1036     fn ctx_eq(
1037         &self,
1038         (a_ty, a_inst): &(Type, InstructionData),
1039         (b_ty, b_inst): &(Type, InstructionData),
1040     ) -> bool {
1041         a_ty == b_ty && a_inst.eq(b_inst, self.value_lists)
1042     }
1043 }
1044 
1045 impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData))1046     fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
1047         core::hash::Hash::hash(&ty, state);
1048         inst.hash(state, self.value_lists);
1049     }
1050 }
1051 
1052 /// Statistics collected during egraph-based processing.
1053 #[derive(Clone, Debug, Default)]
1054 pub(crate) struct Stats {
1055     pub(crate) pure_inst: u64,
1056     pub(crate) pure_inst_deduped: u64,
1057     pub(crate) pure_inst_subsume: u64,
1058     pub(crate) pure_inst_rewrite_to_self: u64,
1059     pub(crate) pure_inst_insert_orig: u64,
1060     pub(crate) pure_inst_insert_new: u64,
1061     pub(crate) skeleton_inst: u64,
1062     pub(crate) skeleton_inst_simplified: u64,
1063     pub(crate) skeleton_inst_gvn: u64,
1064     pub(crate) alias_analysis_removed: u64,
1065     pub(crate) new_inst: u64,
1066     pub(crate) union: u64,
1067     pub(crate) subsume: u64,
1068     pub(crate) remat: u64,
1069     pub(crate) rewrite_rule_invoked: u64,
1070     pub(crate) rewrite_rule_results: u64,
1071     pub(crate) rewrite_depth_limit: u64,
1072     pub(crate) elaborate_visit_node: u64,
1073     pub(crate) elaborate_memoize_hit: u64,
1074     pub(crate) elaborate_memoize_miss: u64,
1075     pub(crate) elaborate_remat: u64,
1076     pub(crate) elaborate_licm_hoist: u64,
1077     pub(crate) elaborate_func: u64,
1078     pub(crate) elaborate_func_pre_insts: u64,
1079     pub(crate) elaborate_func_post_insts: u64,
1080     pub(crate) eclass_size_limit: u64,
1081 }
1082