1 //! Support for egraphs represented in the DataFlowGraph.
2 
3 use crate::FxHashSet;
4 use crate::alias_analysis::{AliasAnalysis, LastStores};
5 use crate::ctxhash::{CtxEq, CtxHash, NullCtx};
6 use crate::cursor::{Cursor, CursorPosition, FuncCursor};
7 use crate::dominator_tree::DominatorTree;
8 use crate::egraph::elaborate::Elaborator;
9 use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
10 use crate::ir::pcc::Fact;
11 use crate::ir::{
12     Block, DataFlowGraph, Function, Inst, InstructionData, Opcode, Type, Value, ValueDef,
13     ValueListPool,
14 };
15 use crate::loop_analysis::LoopAnalysis;
16 use crate::opts::IsleContext;
17 use crate::opts::generated_code::SkeletonInstSimplification;
18 use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
19 use crate::settings::Flags;
20 use crate::take_and_replace::TakeAndReplace;
21 use crate::trace;
22 use alloc::vec::Vec;
23 use core::cmp::Ordering;
24 use core::hash::Hasher;
25 use cranelift_control::ControlPlane;
26 use cranelift_entity::SecondaryMap;
27 use cranelift_entity::packed_option::ReservedValue;
28 use smallvec::SmallVec;
29 
30 mod cost;
31 mod elaborate;
32 
33 /// Pass over a Function that does the whole aegraph thing.
34 ///
35 /// - Removes non-skeleton nodes from the Layout.
36 /// - Performs a GVN-and-rule-application pass over all Values
37 ///   reachable from the skeleton, potentially creating new Union
38 ///   nodes (i.e., an aegraph) so that some values have multiple
39 ///   representations.
40 /// - Does "extraction" on the aegraph: selects the best value out of
41 ///   the tree-of-Union nodes for each used value.
42 /// - Does "scoped elaboration" on the aegraph: chooses one or more
43 ///   locations for pure nodes to become instructions again in the
44 ///   layout, as forced by the skeleton.
45 ///
46 /// At the beginning and end of this pass, the CLIF should be in a
47 /// state that passes the verifier and, additionally, has no Union
48 /// nodes. During the pass, Union nodes may exist, and instructions in
49 /// the layout may refer to results of instructions that are not
50 /// placed in the layout.
51 pub struct EgraphPass<'a> {
52     /// The function we're operating on.
53     func: &'a mut Function,
54     /// Dominator tree for the CFG, used to visit blocks in pre-order
55     /// so we see value definitions before their uses, and also used for
56     /// O(1) dominance checks.
57     domtree: &'a DominatorTree,
58     /// Alias analysis, used during optimization.
59     alias_analysis: &'a mut AliasAnalysis<'a>,
60     /// Loop analysis results, used for built-in LICM during
61     /// elaboration.
62     loop_analysis: &'a LoopAnalysis,
63     /// Compiler flags.
64     flags: &'a Flags,
65     /// Chaos-mode control-plane so we can test that we still get
66     /// correct results when our heuristics make bad decisions.
67     ctrl_plane: &'a mut ControlPlane,
68     /// Which Values do we want to rematerialize in each block where
69     /// they're used?
70     remat_values: FxHashSet<Value>,
71     /// Stats collected while we run this pass.
72     pub(crate) stats: Stats,
73 }
74 
75 /// The maximum number of rewrites we will take from a single call into ISLE.
76 const MATCHES_LIMIT: usize = 5;
77 
78 /// The maximum number of enodes in any given eclass.
79 const ECLASS_ENODE_LIMIT: usize = 5;
80 
81 /// Context passed through node insertion and optimization.
82 pub(crate) struct OptimizeCtx<'opt, 'analysis>
83 where
84     'analysis: 'opt,
85 {
86     // Borrowed from EgraphPass:
87     pub(crate) func: &'opt mut Function,
88     pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
89     available_block: &'opt mut SecondaryMap<Value, Block>,
90     eclass_size: &'opt mut SecondaryMap<Value, u8>,
91     pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>,
92     pub(crate) gvn_map_blocks: &'opt Vec<Block>,
93     pub(crate) remat_values: &'opt mut FxHashSet<Value>,
94     pub(crate) stats: &'opt mut Stats,
95     domtree: &'opt DominatorTree,
96     pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
97     pub(crate) alias_analysis_state: &'opt mut LastStores,
98     flags: &'opt Flags,
99     ctrl_plane: &'opt mut ControlPlane,
100     // Held locally during optimization of one node (recursively):
101     pub(crate) rewrite_depth: usize,
102     pub(crate) subsume_values: FxHashSet<Value>,
103     optimized_values: SmallVec<[Value; MATCHES_LIMIT]>,
104     optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>,
105 }
106 
107 /// For passing to `insert_pure_enode`. Sometimes the enode already
108 /// exists as an Inst (from the original CLIF), and sometimes we're in
109 /// the middle of creating it and want to avoid inserting it if
110 /// possible until we know we need it.
111 pub(crate) enum NewOrExistingInst {
112     New(InstructionData, Type),
113     Existing(Inst),
114 }
115 
116 impl NewOrExistingInst {
117     fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
118         match self {
119             NewOrExistingInst::New(data, ty) => (*ty, *data),
120             NewOrExistingInst::Existing(inst) => {
121                 let ty = dfg.ctrl_typevar(*inst);
122                 (ty, dfg.insts[*inst])
123             }
124         }
125     }
126 }
127 
128 impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
129 where
130     'analysis: 'opt,
131 {
132     /// Optimization of a single instruction.
133     ///
134     /// This does a few things:
135     /// - Looks up the instruction in the GVN deduplication map. If we
136     ///   already have the same instruction somewhere else, with the
137     ///   same args, then we can alias the original instruction's
138     ///   results and omit this instruction entirely.
139     /// - If the instruction is "new" (not deduplicated), then apply
140     ///   optimization rules:
141     ///   - All of the mid-end rules written in ISLE.
142     ///   - Store-to-load forwarding.
143     /// - Update the value-to-opt-value map, and update the eclass
144     ///   union-find, if we rewrote the value to different form(s).
145     pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
146         // Create the external context for looking up and updating the
147         // GVN map. This is necessary so that instructions themselves
148         // do not have to carry all the references or data for a full
149         // `Eq` or `Hash` impl.
150         let gvn_context = GVNContext {
151             value_lists: &self.func.dfg.value_lists,
152         };
153 
154         self.stats.pure_inst += 1;
155         if let NewOrExistingInst::New(..) = inst {
156             self.stats.new_inst += 1;
157         }
158 
159         // Does this instruction already exist? If so, add entries to
160         // the value-map to rewrite uses of its results to the results
161         // of the original (existing) instruction. If not, optimize
162         // the new instruction.
163         if let Some(&Some(orig_result)) = self
164             .gvn_map
165             .get(&gvn_context, &inst.get_inst_key(&self.func.dfg))
166         {
167             self.stats.pure_inst_deduped += 1;
168             if let NewOrExistingInst::Existing(inst) = inst {
169                 debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
170                 let result = self.func.dfg.first_result(inst);
171                 self.value_to_opt_value[result] = orig_result;
172                 self.available_block[result] = self.available_block[orig_result];
173                 self.func.dfg.merge_facts(result, orig_result);
174             }
175             orig_result
176         } else {
177             // Now actually insert the InstructionData and attach
178             // result value (exactly one).
179             let (inst, result, ty) = match inst {
180                 NewOrExistingInst::New(data, typevar) => {
181                     self.stats.pure_inst_insert_new += 1;
182                     let inst = self.func.dfg.make_inst(data);
183                     // TODO: reuse return value?
184                     self.func.dfg.make_inst_results(inst, typevar);
185                     let result = self.func.dfg.first_result(inst);
186                     // New inst. We need to do the analysis of its result.
187                     (inst, result, typevar)
188                 }
189                 NewOrExistingInst::Existing(inst) => {
190                     self.stats.pure_inst_insert_orig += 1;
191                     let result = self.func.dfg.first_result(inst);
192                     let ty = self.func.dfg.ctrl_typevar(inst);
193                     (inst, result, ty)
194                 }
195             };
196 
197             self.attach_constant_fact(inst, result, ty);
198 
199             self.available_block[result] = self.get_available_block(inst);
200             let opt_value = self.optimize_pure_enode(inst);
201             log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}");
202 
203             let gvn_context = GVNContext {
204                 value_lists: &self.func.dfg.value_lists,
205             };
206             // Insert at level implied by args. This enables merging
207             // in LICM cases like:
208             //
209             // while (...) {
210             //   if (...) {
211             //     let x = loop_invariant_expr;
212             //   }
213             //   if (...) {
214             //     let x = loop_invariant_expr;
215             //   }
216             // }
217             //
218             // where the two instances of the expression otherwise
219             // wouldn't merge because each would be in a separate
220             // subscope of the scoped hashmap during traversal.
221             log::trace!(
222                 "value {} is available at {}",
223                 opt_value,
224                 self.available_block[opt_value]
225             );
226             let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]);
227             self.gvn_map.insert_with_depth(
228                 &gvn_context,
229                 (ty, self.func.dfg.insts[inst]),
230                 Some(opt_value),
231                 depth,
232             );
233             self.value_to_opt_value[result] = opt_value;
234             opt_value
235         }
236     }
237 
238     /// Find the block where a pure instruction first becomes available,
239     /// defined as the block that is closest to the root where all of
240     /// its arguments are available. In the unusual case where a pure
241     /// instruction has no arguments (e.g. get_return_address), we can
242     /// place it anywhere, so it is available in the entry block.
243     ///
244     /// This function does not compute available blocks recursively.
245     /// All of the instruction's arguments must have had their available
246     /// blocks assigned already.
247     fn get_available_block(&self, inst: Inst) -> Block {
248         // Side-effecting instructions have different rules for where
249         // they become available, so this function does not apply.
250         debug_assert!(is_pure_for_egraph(self.func, inst));
251 
252         // Note that the def-point of all arguments to an instruction
253         // in SSA lie on a line of direct ancestors in the domtree, and
254         // so do their available-blocks. This means that for any pair of
255         // arguments, their available blocks are either the same or one
256         // strictly dominates the other. We just need to find any argument
257         // whose available block is deepest in the domtree.
258         self.func.dfg.insts[inst]
259             .arguments(&self.func.dfg.value_lists)
260             .iter()
261             .map(|&v| {
262                 let block = self.available_block[v];
263                 debug_assert!(!block.is_reserved_value());
264                 block
265             })
266             .max_by(|&x, &y| {
267                 if self.domtree.block_dominates(x, y) {
268                     Ordering::Less
269                 } else {
270                     debug_assert!(self.domtree.block_dominates(y, x));
271                     Ordering::Greater
272                 }
273             })
274             .unwrap_or(self.func.layout.entry_block().unwrap())
275     }
276 
277     fn depth_of_block_in_gvn_map(&self, block: Block) -> usize {
278         log::trace!(
279             "finding depth of available block {} in domtree stack: {:?}",
280             block,
281             self.gvn_map_blocks
282         );
283         self.gvn_map_blocks
284             .iter()
285             .enumerate()
286             .rev()
287             .find(|&(_, b)| *b == block)
288             .unwrap()
289             .0
290     }
291 
292     /// Optimizes an enode by applying any matching mid-end rewrite
293     /// rules (or store-to-load forwarding, which is a special case),
294     /// unioning together all possible optimized (or rewritten) forms
295     /// of this expression into an eclass and returning the `Value`
296     /// that represents that eclass.
297     fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
298         // A pure node always has exactly one result.
299         let orig_value = self.func.dfg.first_result(inst);
300 
301         let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values);
302         let (ctx, optimized_values) = guard.get();
303 
304         // Limit rewrite depth. When we apply optimization rules, they
305         // may create new nodes (values) and those are, recursively,
306         // optimized eagerly as soon as they are created. So we may
307         // have more than one ISLE invocation on the stack. (This is
308         // necessary so that as the toplevel builds the
309         // right-hand-side expression bottom-up, it uses the "latest"
310         // optimized values for all the constituent parts.) To avoid
311         // infinite or problematic recursion, we bound the rewrite
312         // depth to a small constant here.
313         const REWRITE_LIMIT: usize = 5;
314         if ctx.rewrite_depth > REWRITE_LIMIT {
315             ctx.stats.rewrite_depth_limit += 1;
316             return orig_value;
317         }
318         ctx.rewrite_depth += 1;
319         trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth);
320 
321         // Invoke the ISLE toplevel constructor, getting all new
322         // values produced as equivalents to this value.
323         trace!("Calling into ISLE with original value {}", orig_value);
324         ctx.stats.rewrite_rule_invoked += 1;
325         debug_assert!(optimized_values.is_empty());
326         crate::opts::generated_code::constructor_simplify(
327             &mut IsleContext { ctx },
328             orig_value,
329             optimized_values,
330         );
331 
332         ctx.stats.rewrite_rule_results += optimized_values.len() as u64;
333 
334         // It's not supposed to matter what order `simplify` returns values in.
335         ctx.ctrl_plane.shuffle(optimized_values);
336 
337         let num_matches = optimized_values.len();
338         if num_matches > MATCHES_LIMIT {
339             trace!(
340                 "Reached maximum matches limit; too many optimized values \
341                  ({num_matches} > {MATCHES_LIMIT}); ignoring rest.",
342             );
343             optimized_values.truncate(MATCHES_LIMIT);
344         }
345 
346         // Sort and deduplicate optimized values, in case multiple
347         // rules produced the same simplification.
348         optimized_values.sort_unstable();
349         optimized_values.dedup();
350 
351         trace!("  -> returned from ISLE: {orig_value} -> {optimized_values:?}");
352 
353         // Construct a union-node tree representing the new eclass
354         // that results from rewriting. If any returned value was
355         // marked "subsume", take only that value. Otherwise,
356         // sequentially build the chain over the original value and
357         // all returned values.
358         let result_value = if let Some(&subsuming_value) = optimized_values
359             .iter()
360             .find(|&value| ctx.subsume_values.contains(value))
361         {
362             optimized_values.clear();
363             ctx.stats.pure_inst_subsume += 1;
364             subsuming_value
365         } else {
366             let mut union_value = orig_value;
367             let mut eclass_size = ctx.eclass_size[orig_value] + 1;
368             for optimized_value in optimized_values.drain(..) {
369                 trace!(
370                     "Returned from ISLE for {}, got {:?}",
371                     orig_value, optimized_value
372                 );
373                 if optimized_value == orig_value {
374                     trace!(" -> same as orig value; skipping");
375                     ctx.stats.pure_inst_rewrite_to_self += 1;
376                     continue;
377                 }
378                 let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1;
379                 if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT {
380                     trace!(" -> reached eclass size limit");
381                     ctx.stats.eclass_size_limit += 1;
382                     break;
383                 }
384                 let old_union_value = union_value;
385                 union_value = ctx.func.dfg.union(old_union_value, optimized_value);
386                 eclass_size += rhs_eclass_size;
387                 ctx.eclass_size[union_value] = eclass_size - 1;
388                 ctx.stats.union += 1;
389                 trace!(" -> union: now {}", union_value);
390                 ctx.func.dfg.merge_facts(old_union_value, optimized_value);
391                 ctx.available_block[union_value] =
392                     ctx.merge_availability(old_union_value, optimized_value);
393             }
394             union_value
395         };
396 
397         ctx.rewrite_depth -= 1;
398         trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth);
399         if ctx.rewrite_depth == 0 {
400             ctx.subsume_values.clear();
401         }
402 
403         debug_assert!(ctx.optimized_values.is_empty());
404         result_value
405     }
406 
407     fn merge_availability(&self, a: Value, b: Value) -> Block {
408         let a = self.available_block[a];
409         let b = self.available_block[b];
410         if self.domtree.block_dominates(a, b) {
411             a
412         } else {
413             b
414         }
415     }
416 
417     /// Optimize a "skeleton" instruction.
418     ///
419     /// Returns an optional command of how to continue processing the optimized
420     /// instruction (e.g. removing it or replacing it with a new instruction).
421     fn optimize_skeleton_inst(
422         &mut self,
423         inst: Inst,
424         block: Block,
425     ) -> Option<SkeletonInstSimplification> {
426         self.stats.skeleton_inst += 1;
427 
428         // If we have a rewrite rule for this instruction, do that first, so
429         // that GVN and alias analysis only see simplified skeleton
430         // instructions.
431         if let Some(cmd) = self.simplify_skeleton_inst(inst) {
432             self.stats.skeleton_inst_simplified += 1;
433             return Some(cmd);
434         }
435 
436         // First, can we try to deduplicate? We need to keep some copy
437         // of the instruction around because it's side-effecting, but
438         // we may be able to reuse an earlier instance of it.
439         if is_mergeable_for_egraph(self.func, inst) {
440             let result = self.func.dfg.inst_results(inst).get(0).copied();
441             trace!(" -> mergeable side-effecting op {}", inst);
442 
443             // Does this instruction already exist? If so, add entries to
444             // the value-map to rewrite uses of its results to the results
445             // of the original (existing) instruction. If not, optimize
446             // the new instruction.
447             //
448             // Note that the GVN map is scoped, which is important
449             // here: because effectful ops are not removed from the
450             // skeleton (`Layout`), we need to be mindful of whether
451             // our current position is dominated by an instance of the
452             // instruction. (See #5796 for details.)
453             let ty = self.func.dfg.ctrl_typevar(inst);
454             match self
455                 .gvn_map
456                 .entry(&NullCtx, (ty, self.func.dfg.insts[inst]))
457             {
458                 ScopedEntry::Occupied(o) => {
459                     let orig_result = *o.get();
460                     match (result, orig_result) {
461                         (Some(result), Some(orig_result)) => {
462                             // Hit in GVN map -- reuse value.
463                             self.stats.skeleton_inst_gvn += 1;
464                             self.value_to_opt_value[result] = orig_result;
465                             self.available_block[result] = self.available_block[orig_result];
466                             trace!(" -> merges result {} to {}", result, orig_result);
467                         }
468                         (None, None) => {
469                             // Hit in the GVN map, but the instruction doesn't
470                             // produce results, only side effects. Nothing else
471                             // to do here.
472                             self.stats.skeleton_inst_gvn += 1;
473                             trace!(" -> merges with dominating instruction");
474                         }
475                         (_, _) => unreachable!(),
476                     }
477                     Some(SkeletonInstSimplification::Remove)
478                 }
479                 ScopedEntry::Vacant(v) => {
480                     // Otherwise, insert it into the value-map.
481                     if let Some(result) = result {
482                         self.value_to_opt_value[result] = result;
483                         self.available_block[result] = block;
484                     }
485                     v.insert(result);
486                     trace!(" -> inserts as new (no GVN)");
487                     None
488                 }
489             }
490         }
491         // Otherwise, if a load or store, process it with the alias
492         // analysis to see if we can optimize it (rewrite in terms of
493         // an earlier load or stored value).
494         else if let Some(new_result) =
495             self.alias_analysis
496                 .process_inst(self.func, self.alias_analysis_state, inst)
497         {
498             self.stats.alias_analysis_removed += 1;
499             let result = self.func.dfg.first_result(inst);
500             trace!(
501                 " -> inst {} has result {} replaced with {}",
502                 inst, result, new_result
503             );
504             self.value_to_opt_value[result] = new_result;
505             self.available_block[result] = self.available_block[new_result];
506             self.func.dfg.merge_facts(result, new_result);
507             Some(SkeletonInstSimplification::Remove)
508         }
509         // Otherwise, generic side-effecting op -- always keep it, and
510         // set its results to identity-map to original values.
511         else {
512             // Set all results to identity-map to themselves
513             // in the value-to-opt-value map.
514             for &result in self.func.dfg.inst_results(inst) {
515                 self.value_to_opt_value[result] = result;
516                 self.available_block[result] = block;
517             }
518             None
519         }
520     }
521 
522     /// Find the best simplification of the given skeleton instruction, if any,
523     /// by consulting our `simplify_skeleton` ISLE rules.
524     fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> {
525         // We cannot currently simplify terminators, or simplify into
526         // terminators. Anything that could change the control-flow graph is off
527         // limits.
528         //
529         // Consider the following CLIF snippet:
530         //
531         //     block0(v0: i64):
532         //         v1 = iconst.i32 0
533         //         trapz v1, user42
534         //         v2 = load.i32 v0
535         //         brif v1, block1, block2
536         //     block1:
537         //         return v2
538         //     block2:
539         //         v3 = iconst.i32 1
540         //         v4 = iadd v2, v3
541         //         return v4
542         //
543         // We would ideally like to perform simplifications like replacing the
544         // `trapz` with an unconditional `trap` and the conditional `brif`
545         // branch with an unconditional `jump`. Note, however, that blocks
546         // `block1` and `block2` are dominated by `block0` and therefore can and
547         // do use values defined in `block0`. This presents challenges:
548         //
549         // * If we replace the `brif` with a `jump`, then we've mutated the
550         //   control-flow graph and removed that domination property. The uses
551         //   of `v2` and `v3` in those blocks become invalid.
552         //
553         // * Even worse, if we turn the `trapz` into a `trap`, we are
554         //   introducing a terminator into the middle of the block, which leaves
555         //   us with two choices to fix up the IR so that there aren't any
556         //   instructions following the terminator in the block:
557         //
558         //   1. We can split the unreachable instructions off into a new
559         //      block. However, there is no control-flow edge from the current
560         //      block to this new block and so, again, the new block isn't
561         //      dominated by the current block, and therefore the can't use
562         //      values defined in this block or any dominating it. The `load`
563         //      instruction uses `v0` but is not dominated by `v0`'s
564         //      definition.
565         //
566         //   2. Alternatively, we can simply delete the trailing instructions,
567         //      since they are unreachable. But then not only are the old
568         //      instructions' uses no longer dominated by their definitions, but
569         //      the definitions do not exist at all anymore!
570         //
571         // Whatever approach we would take, we would invalidate value uses, and
572         // would need to track and fix them up.
573         if self.func.dfg.insts[inst].opcode().is_branch() {
574             return None;
575         }
576 
577         let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts);
578         let (ctx, optimized_insts) = guard.get();
579 
580         crate::opts::generated_code::constructor_simplify_skeleton(
581             &mut IsleContext { ctx },
582             inst,
583             optimized_insts,
584         );
585 
586         let simplifications_len = optimized_insts.len();
587         log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)");
588         if simplifications_len > MATCHES_LIMIT {
589             log::trace!("      too many candidate simplifications; truncating to {MATCHES_LIMIT}");
590             optimized_insts.truncate(MATCHES_LIMIT);
591         }
592 
593         // Find the best simplification, if any, from our candidates.
594         //
595         // Unlike simplifying pure values, we do not add side-effectful
596         // instructions to the egraph, nor do we extract the best version via
597         // dynamic programming and considering the costs of operands. Instead,
598         // we greedily choose the best simplification. This is because there is
599         // an impedance mismatch: the egraph and our pure rewrites are centered
600         // around *values*, but we don't represent side-effects with values, we
601         // represent them implicitly in their *instructions*.
602         //
603         // The initial best choice is "no simplification, just use the original
604         // instruction" which has the original instruction's cost.
605         let mut best = None;
606         let mut best_cost = cost::Cost::of_skeleton_op(
607             ctx.func.dfg.insts[inst].opcode(),
608             ctx.func.dfg.inst_args(inst).len(),
609         );
610         while let Some(simplification) = optimized_insts.pop() {
611             let (new_inst, new_val) = match simplification {
612                 // We can't do better than completely removing the skeleton
613                 // instruction, so short-cicuit the loop and eagerly return the
614                 // `Remove*` simplifications.
615                 SkeletonInstSimplification::Remove => {
616                     log::trace!(" -> simplify_skeleton: remove inst");
617                     debug_assert!(ctx.func.dfg.inst_results(inst).is_empty());
618                     return Some(simplification);
619                 }
620                 SkeletonInstSimplification::RemoveWithVal { val } => {
621                     log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result");
622                     if cfg!(debug_assertions) {
623                         let results = ctx.func.dfg.inst_results(inst);
624                         debug_assert_eq!(results.len(), 1);
625                         debug_assert_eq!(
626                             ctx.func.dfg.value_type(results[0]),
627                             ctx.func.dfg.value_type(val),
628                         );
629                     }
630                     return Some(simplification);
631                 }
632 
633                 // For instruction replacement simplification, we want to check
634                 // that the replacements define the same number and types of
635                 // values as the original instruction, and also determine
636                 // whether they are actually an improvement over (i.e. have
637                 // lower cost than) the original instruction.
638                 SkeletonInstSimplification::Replace { inst } => {
639                     log::trace!(
640                         " -> simplify_skeleton: replace inst with {inst}: {}",
641                         ctx.func.dfg.display_inst(inst)
642                     );
643                     (inst, None)
644                 }
645                 SkeletonInstSimplification::ReplaceWithVal { inst, val } => {
646                     log::trace!(
647                         " -> simplify_skeleton: replace inst with {val} and {inst}: {}",
648                         ctx.func.dfg.display_inst(inst)
649                     );
650                     (inst, Some(val))
651                 }
652             };
653 
654             if cfg!(debug_assertions) {
655                 let opcode = ctx.func.dfg.insts[inst].opcode();
656                 debug_assert!(
657                     !(opcode.is_terminator() || opcode.is_branch()),
658                     "simplifying control-flow instructions and terminators is not yet supported",
659                 );
660 
661                 let old_vals = ctx.func.dfg.inst_results(inst);
662                 let new_vals = if let Some(val) = new_val.as_ref() {
663                     core::slice::from_ref(val)
664                 } else {
665                     ctx.func.dfg.inst_results(new_inst)
666                 };
667                 debug_assert_eq!(
668                     old_vals.len(),
669                     new_vals.len(),
670                     "skeleton simplification should result in the same number of result values",
671                 );
672 
673                 for (old_val, new_val) in old_vals.iter().zip(new_vals) {
674                     let old_ty = ctx.func.dfg.value_type(*old_val);
675                     let new_ty = ctx.func.dfg.value_type(*new_val);
676                     debug_assert_eq!(
677                         old_ty, new_ty,
678                         "skeleton simplification should result in values of the correct type",
679                     );
680                 }
681             }
682 
683             // Our best simplification is the one with the least cost. Update
684             // `best` if necessary.
685             let cost = cost::Cost::of_skeleton_op(
686                 ctx.func.dfg.insts[new_inst].opcode(),
687                 ctx.func.dfg.inst_args(new_inst).len(),
688             );
689             if cost < best_cost {
690                 best = Some(simplification);
691                 best_cost = cost;
692             }
693         }
694 
695         // Return the best simplification!
696         best
697     }
698 
699     /// Helper to propagate facts on constant values: if PCC is
700     /// enabled, then unconditionally add a fact attesting to the
701     /// Value's concrete value.
702     fn attach_constant_fact(&mut self, inst: Inst, value: Value, ty: Type) {
703         if self.flags.enable_pcc() {
704             if let InstructionData::UnaryImm {
705                 opcode: Opcode::Iconst,
706                 imm,
707             } = self.func.dfg.insts[inst]
708             {
709                 let imm: i64 = imm.into();
710                 self.func.dfg.facts[value] =
711                     Some(Fact::constant(ty.bits().try_into().unwrap(), imm as u64));
712             }
713         }
714     }
715 }
716 
717 impl<'a> EgraphPass<'a> {
718     /// Create a new EgraphPass.
719     pub fn new(
720         func: &'a mut Function,
721         domtree: &'a DominatorTree,
722         loop_analysis: &'a LoopAnalysis,
723         alias_analysis: &'a mut AliasAnalysis<'a>,
724         flags: &'a Flags,
725         ctrl_plane: &'a mut ControlPlane,
726     ) -> Self {
727         Self {
728             func,
729             domtree,
730             loop_analysis,
731             alias_analysis,
732             flags,
733             ctrl_plane,
734             stats: Stats::default(),
735             remat_values: FxHashSet::default(),
736         }
737     }
738 
739     /// Run the process.
740     pub fn run(&mut self) {
741         self.remove_pure_and_optimize();
742 
743         trace!("egraph built:\n{}\n", self.func.display());
744         if cfg!(feature = "trace-log") {
745             for (value, def) in self.func.dfg.values_and_defs() {
746                 trace!(" -> {} = {:?}", value, def);
747                 match def {
748                     ValueDef::Result(i, 0) => {
749                         trace!("  -> {} = {:?}", i, self.func.dfg.insts[i]);
750                     }
751                     _ => {}
752                 }
753             }
754         }
755 
756         self.elaborate();
757 
758         log::trace!("stats: {:#?}", self.stats);
759     }
760 
761     /// Remove pure nodes from the `Layout` of the function, ensuring
762     /// that only the "side-effect skeleton" remains, and also
763     /// optimize the pure nodes. This is the first step of
764     /// egraph-based processing and turns the pure CFG-based CLIF into
765     /// a CFG skeleton with a sea of (optimized) nodes tying it
766     /// together.
767     ///
768     /// As we walk through the code, we eagerly apply optimization
769     /// rules; at any given point we have a "latest version" of an
770     /// eclass of possible representations for a `Value` in the
771     /// original program, which is itself a `Value` at the root of a
772     /// union-tree. We keep a map from the original values to these
773     /// optimized values. When we encounter any instruction (pure or
774     /// side-effecting skeleton) we rewrite its arguments to capture
775     /// the "latest" optimized forms of these values. (We need to do
776     /// this as part of this pass, and not later using a finished map,
777     /// because the eclass can continue to be updated and we need to
778     /// only refer to its subset that exists at this stage, to
779     /// maintain acyclicity.)
780     fn remove_pure_and_optimize(&mut self) {
781         let mut cursor = FuncCursor::new(self.func);
782         let mut value_to_opt_value: SecondaryMap<Value, Value> =
783             SecondaryMap::with_default(Value::reserved_value());
784 
785         // Map from instruction to value for hash-consing of pure ops
786         // into the egraph. This can be a standard (non-scoped)
787         // hashmap because pure ops have no location: they are
788         // "outside of" control flow.
789         //
790         // Note also that we keep the controlling typevar (the `Type`
791         // in the tuple below) because it may disambiguate
792         // instructions that are identical except for type.
793         //
794         // We store both skeleton and non-skeleton instructions in the
795         // GVN map; for skeleton instructions, we only store those
796         // that are idempotent, i.e., still eligible to GVN. Note that
797         // some skeleton instructions are idempotent but do not
798         // produce a value: e.g., traps on a given condition. To allow
799         // for both cases, we store an `Option<Value>` as the value in
800         // this map.
801         let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> =
802             ScopedHashMap::with_capacity(cursor.func.dfg.num_values());
803 
804         // The block in the domtree preorder traversal at each level
805         // of the GVN map.
806         let mut gvn_map_blocks: Vec<Block> = vec![];
807 
808         // To get the best possible merging and canonicalization, we
809         // track where a value is "available" at: this is the
810         // domtree-nearest-ancestor join of all args if the value
811         // itself is pure, otherwise the block where the value is
812         // defined. (And for union nodes, the
813         // domtree-highest-ancestor, i.e., the meet or the dual to the
814         // above join.)
815         let mut available_block: SecondaryMap<Value, Block> =
816             SecondaryMap::with_default(Block::reserved_value());
817 
818         // To avoid blowing up eclasses too much, we track the size of
819         // each eclass reachable by a tree of union nodes from a given
820         // value ID, and we avoid union'ing additional values into an
821         // eclass when it reaches `ECLASS_ENODE_LIMIT`.
822         //
823         // For efficiency, this encodes size minus one: so a value of
824         // zero (which is cheap to bulk-initialize) means a singleton
825         // eclass of size one. This also allows us to avoid explicitly
826         // writing the size for any values that are not union nodes.
827         let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0);
828 
829         // This is an initial guess at the size we'll need, but we add
830         // more values as we build simplified alternative expressions so
831         // this is likely to realloc again later.
832         available_block.resize(cursor.func.dfg.num_values());
833 
834         // In domtree preorder, visit blocks. (TODO: factor out an
835         // iterator from this and elaborator.)
836         let root = cursor.layout().entry_block().unwrap();
837         enum StackEntry {
838             Visit(Block),
839             Pop,
840         }
841         let mut block_stack = vec![StackEntry::Visit(root)];
842         while let Some(entry) = block_stack.pop() {
843             match entry {
844                 StackEntry::Visit(block) => {
845                     // We popped this block; push children
846                     // immediately, then process this block.
847                     block_stack.push(StackEntry::Pop);
848                     block_stack.extend(
849                         self.ctrl_plane
850                             .shuffled(self.domtree.children(block))
851                             .map(StackEntry::Visit),
852                     );
853                     gvn_map.increment_depth();
854                     gvn_map_blocks.push(block);
855 
856                     trace!("Processing block {}", block);
857                     cursor.set_position(CursorPosition::Before(block));
858 
859                     let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
860 
861                     for &param in cursor.func.dfg.block_params(block) {
862                         trace!("creating initial singleton eclass for blockparam {}", param);
863                         value_to_opt_value[param] = param;
864                         available_block[param] = block;
865                     }
866                     while let Some(inst) = cursor.next_inst() {
867                         trace!(
868                             "Processing inst {inst}: {}",
869                             cursor.func.dfg.display_inst(inst),
870                         );
871 
872                         // Rewrite args of *all* instructions using the
873                         // value-to-opt-value map.
874                         cursor.func.dfg.map_inst_values(inst, |arg| {
875                             let new_value = value_to_opt_value[arg];
876                             trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
877                             debug_assert_ne!(new_value, Value::reserved_value());
878                             new_value
879                         });
880 
881                         // Build a context for optimization, with borrows of
882                         // state. We can't invoke a method on `self` because
883                         // we've borrowed `self.func` mutably (as
884                         // `cursor.func`) so we pull apart the pieces instead
885                         // here.
886                         let mut ctx = OptimizeCtx {
887                             func: cursor.func,
888                             value_to_opt_value: &mut value_to_opt_value,
889                             gvn_map: &mut gvn_map,
890                             gvn_map_blocks: &mut gvn_map_blocks,
891                             available_block: &mut available_block,
892                             eclass_size: &mut eclass_size,
893                             rewrite_depth: 0,
894                             subsume_values: FxHashSet::default(),
895                             remat_values: &mut self.remat_values,
896                             stats: &mut self.stats,
897                             domtree: &self.domtree,
898                             alias_analysis: self.alias_analysis,
899                             alias_analysis_state: &mut alias_analysis_state,
900                             flags: self.flags,
901                             ctrl_plane: self.ctrl_plane,
902                             optimized_values: Default::default(),
903                             optimized_insts: Default::default(),
904                         };
905 
906                         if is_pure_for_egraph(ctx.func, inst) {
907                             // Insert into GVN map and optimize any new nodes
908                             // inserted (recursively performing this work for
909                             // any nodes the optimization rules produce).
910                             let inst = NewOrExistingInst::Existing(inst);
911                             ctx.insert_pure_enode(inst);
912                             // We've now rewritten all uses, or will when we
913                             // see them, and the instruction exists as a pure
914                             // enode in the eclass, so we can remove it.
915                             cursor.remove_inst_and_step_back();
916                         } else {
917                             if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) {
918                                 Self::execute_skeleton_inst_simplification(
919                                     cmd,
920                                     &mut cursor,
921                                     &mut value_to_opt_value,
922                                     inst,
923                                 );
924                             }
925                         }
926                     }
927                 }
928                 StackEntry::Pop => {
929                     gvn_map.decrement_depth();
930                     gvn_map_blocks.pop();
931                 }
932             }
933         }
934     }
935 
936     /// Execute a simplification of an instruction in the side-effectful
937     /// skeleton.
938     fn execute_skeleton_inst_simplification(
939         simplification: SkeletonInstSimplification,
940         cursor: &mut FuncCursor,
941         value_to_opt_value: &mut SecondaryMap<Value, Value>,
942         old_inst: Inst,
943     ) {
944         let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| {
945             cursor.func.dfg.change_to_alias(old_val, new_val);
946             value_to_opt_value[old_val] = new_val;
947         };
948 
949         let (new_inst, new_val) = match simplification {
950             SkeletonInstSimplification::Remove => {
951                 cursor.remove_inst_and_step_back();
952                 return;
953             }
954             SkeletonInstSimplification::RemoveWithVal { val } => {
955                 cursor.remove_inst_and_step_back();
956                 let old_val = cursor.func.dfg.first_result(old_inst);
957                 cursor.func.dfg.detach_inst_results(old_inst);
958                 forward_val(cursor, old_val, val);
959                 return;
960             }
961             SkeletonInstSimplification::Replace { inst } => (inst, None),
962             SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)),
963         };
964 
965         // Replace the old instruction with the new one.
966         cursor.replace_inst(new_inst);
967         debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator());
968 
969         // Redirect the old instruction's result values to our new instruction's
970         // result values.
971         let mut i = 0;
972         let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value {
973             if let Some(val) = new_val {
974                 val
975             } else {
976                 let val = dfg.inst_results(new_inst)[i];
977                 i += 1;
978                 val
979             }
980         };
981         for i in 0..cursor.func.dfg.inst_results(old_inst).len() {
982             let old_val = cursor.func.dfg.inst_results(old_inst)[i];
983             let new_val = next_new_val(&cursor.func.dfg);
984             forward_val(cursor, old_val, new_val);
985         }
986 
987         // Back up so that the next iteration of the outer egraph loop will
988         // process the new instruction.
989         cursor.goto_inst(new_inst);
990         cursor.prev_inst();
991     }
992 
993     /// Scoped elaboration: compute a final ordering of op computation
994     /// for each block and update the given Func body. After this
995     /// runs, the function body is back into the state where every
996     /// Inst with an used result is placed in the layout (possibly
997     /// duplicated, if our code-motion logic decides this is the best
998     /// option).
999     ///
1000     /// This works in concert with the domtree. We do a preorder
1001     /// traversal of the domtree, tracking a scoped map from Id to
1002     /// (new) Value. The map's scopes correspond to levels in the
1003     /// domtree.
1004     ///
1005     /// At each block, we iterate forward over the side-effecting
1006     /// eclasses, and recursively generate their arg eclasses, then
1007     /// emit the ops themselves.
1008     ///
1009     /// To use an eclass in a given block, we first look it up in the
1010     /// scoped map, and get the Value if already present. If not, we
1011     /// need to generate it. We emit the extracted enode for this
1012     /// eclass after recursively generating its args. Eclasses are
1013     /// thus computed "as late as possible", but then memoized into
1014     /// the Id-to-Value map and available to all dominated blocks and
1015     /// for the rest of this block. (This subsumes GVN.)
1016     fn elaborate(&mut self) {
1017         let mut elaborator = Elaborator::new(
1018             self.func,
1019             &self.domtree,
1020             self.loop_analysis,
1021             &self.remat_values,
1022             &mut self.stats,
1023             self.ctrl_plane,
1024         );
1025         elaborator.elaborate();
1026 
1027         self.check_post_egraph();
1028     }
1029 
1030     #[cfg(debug_assertions)]
1031     fn check_post_egraph(&self) {
1032         // Verify that no union nodes are reachable from inst args,
1033         // and that all inst args' defining instructions are in the
1034         // layout.
1035         for block in self.func.layout.blocks() {
1036             for inst in self.func.layout.block_insts(block) {
1037                 self.func
1038                     .dfg
1039                     .inst_values(inst)
1040                     .for_each(|arg| match self.func.dfg.value_def(arg) {
1041                         ValueDef::Result(i, _) => {
1042                             debug_assert!(self.func.layout.inst_block(i).is_some());
1043                         }
1044                         ValueDef::Union(..) => {
1045                             panic!("egraph union node {arg} still reachable at {inst}!");
1046                         }
1047                         _ => {}
1048                     })
1049             }
1050         }
1051     }
1052 
1053     #[cfg(not(debug_assertions))]
1054     fn check_post_egraph(&self) {}
1055 }
1056 
1057 /// Implementation of external-context equality and hashing on
1058 /// InstructionData. This allows us to deduplicate instructions given
1059 /// some context that lets us see its value lists, so we don't need to
1060 /// store arguments inline in the `InstructionData` (or alongside it in
1061 /// some newly-defined key type) in all cases.
1062 struct GVNContext<'a> {
1063     value_lists: &'a ValueListPool,
1064 }
1065 
1066 impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
1067     fn ctx_eq(
1068         &self,
1069         (a_ty, a_inst): &(Type, InstructionData),
1070         (b_ty, b_inst): &(Type, InstructionData),
1071     ) -> bool {
1072         a_ty == b_ty && a_inst.eq(b_inst, self.value_lists)
1073     }
1074 }
1075 
1076 impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
1077     fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
1078         core::hash::Hash::hash(&ty, state);
1079         inst.hash(state, self.value_lists);
1080     }
1081 }
1082 
1083 /// Statistics collected during egraph-based processing.
1084 #[derive(Clone, Debug, Default)]
1085 pub(crate) struct Stats {
1086     pub(crate) pure_inst: u64,
1087     pub(crate) pure_inst_deduped: u64,
1088     pub(crate) pure_inst_subsume: u64,
1089     pub(crate) pure_inst_rewrite_to_self: u64,
1090     pub(crate) pure_inst_insert_orig: u64,
1091     pub(crate) pure_inst_insert_new: u64,
1092     pub(crate) skeleton_inst: u64,
1093     pub(crate) skeleton_inst_simplified: u64,
1094     pub(crate) skeleton_inst_gvn: u64,
1095     pub(crate) alias_analysis_removed: u64,
1096     pub(crate) new_inst: u64,
1097     pub(crate) union: u64,
1098     pub(crate) subsume: u64,
1099     pub(crate) remat: u64,
1100     pub(crate) rewrite_rule_invoked: u64,
1101     pub(crate) rewrite_rule_results: u64,
1102     pub(crate) rewrite_depth_limit: u64,
1103     pub(crate) elaborate_visit_node: u64,
1104     pub(crate) elaborate_memoize_hit: u64,
1105     pub(crate) elaborate_memoize_miss: u64,
1106     pub(crate) elaborate_remat: u64,
1107     pub(crate) elaborate_licm_hoist: u64,
1108     pub(crate) elaborate_func: u64,
1109     pub(crate) elaborate_func_pre_insts: u64,
1110     pub(crate) elaborate_func_post_insts: u64,
1111     pub(crate) eclass_size_limit: u64,
1112 }
1113