1 //! Support for egraphs represented in the DataFlowGraph. 2 3 use crate::FxHashSet; 4 use crate::alias_analysis::{AliasAnalysis, LastStores}; 5 use crate::ctxhash::{CtxEq, CtxHash, NullCtx}; 6 use crate::cursor::{Cursor, CursorPosition, FuncCursor}; 7 use crate::dominator_tree::DominatorTree; 8 use crate::egraph::elaborate::Elaborator; 9 use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph}; 10 use crate::ir::{ 11 Block, DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool, 12 }; 13 use crate::loop_analysis::LoopAnalysis; 14 use crate::opts::IsleContext; 15 use crate::opts::generated_code::SkeletonInstSimplification; 16 use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap}; 17 use crate::take_and_replace::TakeAndReplace; 18 use crate::trace; 19 use alloc::vec::Vec; 20 use core::cmp::Ordering; 21 use core::hash::Hasher; 22 use cranelift_control::ControlPlane; 23 use cranelift_entity::SecondaryMap; 24 use cranelift_entity::packed_option::ReservedValue; 25 use smallvec::SmallVec; 26 27 mod cost; 28 mod elaborate; 29 30 /// Pass over a Function that does the whole aegraph thing. 31 /// 32 /// - Removes non-skeleton nodes from the Layout. 33 /// - Performs a GVN-and-rule-application pass over all Values 34 /// reachable from the skeleton, potentially creating new Union 35 /// nodes (i.e., an aegraph) so that some values have multiple 36 /// representations. 37 /// - Does "extraction" on the aegraph: selects the best value out of 38 /// the tree-of-Union nodes for each used value. 39 /// - Does "scoped elaboration" on the aegraph: chooses one or more 40 /// locations for pure nodes to become instructions again in the 41 /// layout, as forced by the skeleton. 42 /// 43 /// At the beginning and end of this pass, the CLIF should be in a 44 /// state that passes the verifier and, additionally, has no Union 45 /// nodes. During the pass, Union nodes may exist, and instructions in 46 /// the layout may refer to results of instructions that are not 47 /// placed in the layout. 48 pub struct EgraphPass<'a> { 49 /// The function we're operating on. 50 func: &'a mut Function, 51 /// Dominator tree for the CFG, used to visit blocks in pre-order 52 /// so we see value definitions before their uses, and also used for 53 /// O(1) dominance checks. 54 domtree: &'a DominatorTree, 55 /// Alias analysis, used during optimization. 56 alias_analysis: &'a mut AliasAnalysis<'a>, 57 /// Loop analysis results, used for built-in LICM during 58 /// elaboration. 59 loop_analysis: &'a LoopAnalysis, 60 /// Chaos-mode control-plane so we can test that we still get 61 /// correct results when our heuristics make bad decisions. 62 ctrl_plane: &'a mut ControlPlane, 63 /// Which Values do we want to rematerialize in each block where 64 /// they're used? 65 remat_values: FxHashSet<Value>, 66 /// Stats collected while we run this pass. 67 pub(crate) stats: Stats, 68 } 69 70 /// The maximum number of rewrites we will take from a single call into ISLE. 71 const MATCHES_LIMIT: usize = 5; 72 73 /// The maximum number of enodes in any given eclass. 74 const ECLASS_ENODE_LIMIT: usize = 5; 75 76 /// Context passed through node insertion and optimization. 77 pub(crate) struct OptimizeCtx<'opt, 'analysis> 78 where 79 'analysis: 'opt, 80 { 81 // Borrowed from EgraphPass: 82 pub(crate) func: &'opt mut Function, 83 pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>, 84 available_block: &'opt mut SecondaryMap<Value, Block>, 85 eclass_size: &'opt mut SecondaryMap<Value, u8>, 86 pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>, 87 pub(crate) gvn_map_blocks: &'opt Vec<Block>, 88 pub(crate) remat_values: &'opt mut FxHashSet<Value>, 89 pub(crate) stats: &'opt mut Stats, 90 domtree: &'opt DominatorTree, 91 pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>, 92 pub(crate) alias_analysis_state: &'opt mut LastStores, 93 ctrl_plane: &'opt mut ControlPlane, 94 // Held locally during optimization of one node (recursively): 95 pub(crate) rewrite_depth: usize, 96 pub(crate) subsume_values: FxHashSet<Value>, 97 optimized_values: SmallVec<[Value; MATCHES_LIMIT]>, 98 optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>, 99 } 100 101 /// For passing to `insert_pure_enode`. Sometimes the enode already 102 /// exists as an Inst (from the original CLIF), and sometimes we're in 103 /// the middle of creating it and want to avoid inserting it if 104 /// possible until we know we need it. 105 pub(crate) enum NewOrExistingInst { 106 New(InstructionData, Type), 107 Existing(Inst), 108 } 109 110 impl NewOrExistingInst { get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData)111 fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) { 112 match self { 113 NewOrExistingInst::New(data, ty) => (*ty, *data), 114 NewOrExistingInst::Existing(inst) => { 115 let ty = dfg.ctrl_typevar(*inst); 116 (ty, dfg.insts[*inst]) 117 } 118 } 119 } 120 } 121 122 impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis> 123 where 124 'analysis: 'opt, 125 { 126 /// Optimization of a single instruction. 127 /// 128 /// This does a few things: 129 /// - Looks up the instruction in the GVN deduplication map. If we 130 /// already have the same instruction somewhere else, with the 131 /// same args, then we can alias the original instruction's 132 /// results and omit this instruction entirely. 133 /// - If the instruction is "new" (not deduplicated), then apply 134 /// optimization rules: 135 /// - All of the mid-end rules written in ISLE. 136 /// - Store-to-load forwarding. 137 /// - Update the value-to-opt-value map, and update the eclass 138 /// union-find, if we rewrote the value to different form(s). insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value139 pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value { 140 // Create the external context for looking up and updating the 141 // GVN map. This is necessary so that instructions themselves 142 // do not have to carry all the references or data for a full 143 // `Eq` or `Hash` impl. 144 let gvn_context = GVNContext { 145 value_lists: &self.func.dfg.value_lists, 146 }; 147 148 self.stats.pure_inst += 1; 149 if let NewOrExistingInst::New(..) = inst { 150 self.stats.new_inst += 1; 151 } 152 153 // Does this instruction already exist? If so, add entries to 154 // the value-map to rewrite uses of its results to the results 155 // of the original (existing) instruction. If not, optimize 156 // the new instruction. 157 if let Some(&Some(orig_result)) = self 158 .gvn_map 159 .get(&gvn_context, &inst.get_inst_key(&self.func.dfg)) 160 { 161 self.stats.pure_inst_deduped += 1; 162 if let NewOrExistingInst::Existing(inst) = inst { 163 debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1); 164 let result = self.func.dfg.first_result(inst); 165 self.value_to_opt_value[result] = orig_result; 166 self.available_block[result] = self.available_block[orig_result]; 167 } 168 orig_result 169 } else { 170 // Now actually insert the InstructionData and attach 171 // result value (exactly one). 172 let (inst, result, ty) = match inst { 173 NewOrExistingInst::New(data, typevar) => { 174 self.stats.pure_inst_insert_new += 1; 175 let inst = self.func.dfg.make_inst(data); 176 // TODO: reuse return value? 177 self.func.dfg.make_inst_results(inst, typevar); 178 let result = self.func.dfg.first_result(inst); 179 // New inst. We need to do the analysis of its result. 180 (inst, result, typevar) 181 } 182 NewOrExistingInst::Existing(inst) => { 183 self.stats.pure_inst_insert_orig += 1; 184 let result = self.func.dfg.first_result(inst); 185 let ty = self.func.dfg.ctrl_typevar(inst); 186 (inst, result, ty) 187 } 188 }; 189 190 self.available_block[result] = self.get_available_block(inst); 191 let opt_value = self.optimize_pure_enode(inst); 192 log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}"); 193 194 let gvn_context = GVNContext { 195 value_lists: &self.func.dfg.value_lists, 196 }; 197 // Insert at level implied by args. This enables merging 198 // in LICM cases like: 199 // 200 // while (...) { 201 // if (...) { 202 // let x = loop_invariant_expr; 203 // } 204 // if (...) { 205 // let x = loop_invariant_expr; 206 // } 207 // } 208 // 209 // where the two instances of the expression otherwise 210 // wouldn't merge because each would be in a separate 211 // subscope of the scoped hashmap during traversal. 212 log::trace!( 213 "value {} is available at {}", 214 opt_value, 215 self.available_block[opt_value] 216 ); 217 let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]); 218 self.gvn_map.insert_with_depth( 219 &gvn_context, 220 (ty, self.func.dfg.insts[inst]), 221 Some(opt_value), 222 depth, 223 ); 224 self.value_to_opt_value[result] = opt_value; 225 opt_value 226 } 227 } 228 229 /// Find the block where a pure instruction first becomes available, 230 /// defined as the block that is closest to the root where all of 231 /// its arguments are available. In the unusual case where a pure 232 /// instruction has no arguments (e.g. get_return_address), we can 233 /// place it anywhere, so it is available in the entry block. 234 /// 235 /// This function does not compute available blocks recursively. 236 /// All of the instruction's arguments must have had their available 237 /// blocks assigned already. get_available_block(&self, inst: Inst) -> Block238 fn get_available_block(&self, inst: Inst) -> Block { 239 // Side-effecting instructions have different rules for where 240 // they become available, so this function does not apply. 241 debug_assert!(is_pure_for_egraph(self.func, inst)); 242 243 // Note that the def-point of all arguments to an instruction 244 // in SSA lie on a line of direct ancestors in the domtree, and 245 // so do their available-blocks. This means that for any pair of 246 // arguments, their available blocks are either the same or one 247 // strictly dominates the other. We just need to find any argument 248 // whose available block is deepest in the domtree. 249 self.func.dfg.insts[inst] 250 .arguments(&self.func.dfg.value_lists) 251 .iter() 252 .map(|&v| { 253 let block = self.available_block[v]; 254 debug_assert!(!block.is_reserved_value()); 255 block 256 }) 257 .max_by(|&x, &y| { 258 if self.domtree.block_dominates(x, y) { 259 Ordering::Less 260 } else { 261 debug_assert!(self.domtree.block_dominates(y, x)); 262 Ordering::Greater 263 } 264 }) 265 .unwrap_or(self.func.layout.entry_block().unwrap()) 266 } 267 depth_of_block_in_gvn_map(&self, block: Block) -> usize268 fn depth_of_block_in_gvn_map(&self, block: Block) -> usize { 269 log::trace!( 270 "finding depth of available block {} in domtree stack: {:?}", 271 block, 272 self.gvn_map_blocks 273 ); 274 self.gvn_map_blocks 275 .iter() 276 .enumerate() 277 .rev() 278 .find(|&(_, b)| *b == block) 279 .unwrap() 280 .0 281 } 282 283 /// Optimizes an enode by applying any matching mid-end rewrite 284 /// rules (or store-to-load forwarding, which is a special case), 285 /// unioning together all possible optimized (or rewritten) forms 286 /// of this expression into an eclass and returning the `Value` 287 /// that represents that eclass. optimize_pure_enode(&mut self, inst: Inst) -> Value288 fn optimize_pure_enode(&mut self, inst: Inst) -> Value { 289 // A pure node always has exactly one result. 290 let orig_value = self.func.dfg.first_result(inst); 291 292 let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values); 293 let (ctx, optimized_values) = guard.get(); 294 295 // Limit rewrite depth. When we apply optimization rules, they 296 // may create new nodes (values) and those are, recursively, 297 // optimized eagerly as soon as they are created. So we may 298 // have more than one ISLE invocation on the stack. (This is 299 // necessary so that as the toplevel builds the 300 // right-hand-side expression bottom-up, it uses the "latest" 301 // optimized values for all the constituent parts.) To avoid 302 // infinite or problematic recursion, we bound the rewrite 303 // depth to a small constant here. 304 const REWRITE_LIMIT: usize = 5; 305 if ctx.rewrite_depth > REWRITE_LIMIT { 306 ctx.stats.rewrite_depth_limit += 1; 307 return orig_value; 308 } 309 ctx.rewrite_depth += 1; 310 trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth); 311 312 // Invoke the ISLE toplevel constructor, getting all new 313 // values produced as equivalents to this value. 314 trace!("Calling into ISLE with original value {}", orig_value); 315 ctx.stats.rewrite_rule_invoked += 1; 316 debug_assert!(optimized_values.is_empty()); 317 crate::opts::generated_code::constructor_simplify( 318 &mut IsleContext { ctx }, 319 orig_value, 320 optimized_values, 321 ); 322 323 ctx.stats.rewrite_rule_results += optimized_values.len() as u64; 324 325 // It's not supposed to matter what order `simplify` returns values in. 326 ctx.ctrl_plane.shuffle(optimized_values); 327 328 let num_matches = optimized_values.len(); 329 if num_matches > MATCHES_LIMIT { 330 trace!( 331 "Reached maximum matches limit; too many optimized values \ 332 ({num_matches} > {MATCHES_LIMIT}); ignoring rest.", 333 ); 334 optimized_values.truncate(MATCHES_LIMIT); 335 } 336 337 // Sort and deduplicate optimized values, in case multiple 338 // rules produced the same simplification. 339 optimized_values.sort_unstable(); 340 optimized_values.dedup(); 341 342 trace!(" -> returned from ISLE: {orig_value} -> {optimized_values:?}"); 343 344 // Construct a union-node tree representing the new eclass 345 // that results from rewriting. If any returned value was 346 // marked "subsume", take only that value. Otherwise, 347 // sequentially build the chain over the original value and 348 // all returned values. 349 let result_value = if let Some(&subsuming_value) = optimized_values 350 .iter() 351 .find(|&value| ctx.subsume_values.contains(value)) 352 { 353 optimized_values.clear(); 354 ctx.stats.pure_inst_subsume += 1; 355 subsuming_value 356 } else { 357 let mut union_value = orig_value; 358 let mut eclass_size = ctx.eclass_size[orig_value] + 1; 359 for optimized_value in optimized_values.drain(..) { 360 trace!( 361 "Returned from ISLE for {}, got {:?}", 362 orig_value, optimized_value 363 ); 364 if optimized_value == orig_value { 365 trace!(" -> same as orig value; skipping"); 366 ctx.stats.pure_inst_rewrite_to_self += 1; 367 continue; 368 } 369 let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1; 370 if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT { 371 trace!(" -> reached eclass size limit"); 372 ctx.stats.eclass_size_limit += 1; 373 break; 374 } 375 let old_union_value = union_value; 376 union_value = ctx.func.dfg.union(old_union_value, optimized_value); 377 eclass_size += rhs_eclass_size; 378 ctx.eclass_size[union_value] = eclass_size - 1; 379 ctx.stats.union += 1; 380 trace!(" -> union: now {}", union_value); 381 ctx.available_block[union_value] = 382 ctx.merge_availability(old_union_value, optimized_value); 383 } 384 union_value 385 }; 386 387 ctx.rewrite_depth -= 1; 388 trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth); 389 if ctx.rewrite_depth == 0 { 390 ctx.subsume_values.clear(); 391 } 392 393 debug_assert!(ctx.optimized_values.is_empty()); 394 result_value 395 } 396 merge_availability(&self, a: Value, b: Value) -> Block397 fn merge_availability(&self, a: Value, b: Value) -> Block { 398 let a = self.available_block[a]; 399 let b = self.available_block[b]; 400 if self.domtree.block_dominates(a, b) { 401 a 402 } else { 403 b 404 } 405 } 406 407 /// Optimize a "skeleton" instruction. 408 /// 409 /// Returns an optional command of how to continue processing the optimized 410 /// instruction (e.g. removing it or replacing it with a new instruction). optimize_skeleton_inst( &mut self, inst: Inst, block: Block, ) -> Option<SkeletonInstSimplification>411 fn optimize_skeleton_inst( 412 &mut self, 413 inst: Inst, 414 block: Block, 415 ) -> Option<SkeletonInstSimplification> { 416 self.stats.skeleton_inst += 1; 417 418 // If we have a rewrite rule for this instruction, do that first, so 419 // that GVN and alias analysis only see simplified skeleton 420 // instructions. 421 if let Some(cmd) = self.simplify_skeleton_inst(inst) { 422 self.stats.skeleton_inst_simplified += 1; 423 return Some(cmd); 424 } 425 426 // First, can we try to deduplicate? We need to keep some copy 427 // of the instruction around because it's side-effecting, but 428 // we may be able to reuse an earlier instance of it. 429 if is_mergeable_for_egraph(self.func, inst) { 430 let result = self.func.dfg.inst_results(inst).get(0).copied(); 431 trace!(" -> mergeable side-effecting op {}", inst); 432 433 // Does this instruction already exist? If so, add entries to 434 // the value-map to rewrite uses of its results to the results 435 // of the original (existing) instruction. If not, optimize 436 // the new instruction. 437 // 438 // Note that the GVN map is scoped, which is important 439 // here: because effectful ops are not removed from the 440 // skeleton (`Layout`), we need to be mindful of whether 441 // our current position is dominated by an instance of the 442 // instruction. (See #5796 for details.) 443 let ty = self.func.dfg.ctrl_typevar(inst); 444 match self 445 .gvn_map 446 .entry(&NullCtx, (ty, self.func.dfg.insts[inst])) 447 { 448 ScopedEntry::Occupied(o) => { 449 let orig_result = *o.get(); 450 match (result, orig_result) { 451 (Some(result), Some(orig_result)) => { 452 // Hit in GVN map -- reuse value. 453 self.stats.skeleton_inst_gvn += 1; 454 self.value_to_opt_value[result] = orig_result; 455 self.available_block[result] = self.available_block[orig_result]; 456 trace!(" -> merges result {} to {}", result, orig_result); 457 } 458 (None, None) => { 459 // Hit in the GVN map, but the instruction doesn't 460 // produce results, only side effects. Nothing else 461 // to do here. 462 self.stats.skeleton_inst_gvn += 1; 463 trace!(" -> merges with dominating instruction"); 464 } 465 (_, _) => unreachable!(), 466 } 467 Some(SkeletonInstSimplification::Remove) 468 } 469 ScopedEntry::Vacant(v) => { 470 // Otherwise, insert it into the value-map. 471 if let Some(result) = result { 472 self.value_to_opt_value[result] = result; 473 self.available_block[result] = block; 474 } 475 v.insert(result); 476 trace!(" -> inserts as new (no GVN)"); 477 None 478 } 479 } 480 } 481 // Otherwise, if a load or store, process it with the alias 482 // analysis to see if we can optimize it (rewrite in terms of 483 // an earlier load or stored value). 484 else if let Some(new_result) = 485 self.alias_analysis 486 .process_inst(self.func, self.alias_analysis_state, inst) 487 { 488 self.stats.alias_analysis_removed += 1; 489 let result = self.func.dfg.first_result(inst); 490 trace!( 491 " -> inst {} has result {} replaced with {}", 492 inst, result, new_result 493 ); 494 self.value_to_opt_value[result] = new_result; 495 self.available_block[result] = self.available_block[new_result]; 496 Some(SkeletonInstSimplification::Remove) 497 } 498 // Otherwise, generic side-effecting op -- always keep it, and 499 // set its results to identity-map to original values. 500 else { 501 // Set all results to identity-map to themselves 502 // in the value-to-opt-value map. 503 for &result in self.func.dfg.inst_results(inst) { 504 self.value_to_opt_value[result] = result; 505 self.available_block[result] = block; 506 } 507 None 508 } 509 } 510 511 /// Find the best simplification of the given skeleton instruction, if any, 512 /// by consulting our `simplify_skeleton` ISLE rules. simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification>513 fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> { 514 // We cannot currently simplify terminators, or simplify into 515 // terminators. Anything that could change the control-flow graph is off 516 // limits. 517 // 518 // Consider the following CLIF snippet: 519 // 520 // block0(v0: i64): 521 // v1 = iconst.i32 0 522 // trapz v1, user42 523 // v2 = load.i32 v0 524 // brif v1, block1, block2 525 // block1: 526 // return v2 527 // block2: 528 // v3 = iconst.i32 1 529 // v4 = iadd v2, v3 530 // return v4 531 // 532 // We would ideally like to perform simplifications like replacing the 533 // `trapz` with an unconditional `trap` and the conditional `brif` 534 // branch with an unconditional `jump`. Note, however, that blocks 535 // `block1` and `block2` are dominated by `block0` and therefore can and 536 // do use values defined in `block0`. This presents challenges: 537 // 538 // * If we replace the `brif` with a `jump`, then we've mutated the 539 // control-flow graph and removed that domination property. The uses 540 // of `v2` and `v3` in those blocks become invalid. 541 // 542 // * Even worse, if we turn the `trapz` into a `trap`, we are 543 // introducing a terminator into the middle of the block, which leaves 544 // us with two choices to fix up the IR so that there aren't any 545 // instructions following the terminator in the block: 546 // 547 // 1. We can split the unreachable instructions off into a new 548 // block. However, there is no control-flow edge from the current 549 // block to this new block and so, again, the new block isn't 550 // dominated by the current block, and therefore the can't use 551 // values defined in this block or any dominating it. The `load` 552 // instruction uses `v0` but is not dominated by `v0`'s 553 // definition. 554 // 555 // 2. Alternatively, we can simply delete the trailing instructions, 556 // since they are unreachable. But then not only are the old 557 // instructions' uses no longer dominated by their definitions, but 558 // the definitions do not exist at all anymore! 559 // 560 // Whatever approach we would take, we would invalidate value uses, and 561 // would need to track and fix them up. 562 if self.func.dfg.insts[inst].opcode().is_branch() { 563 return None; 564 } 565 566 let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts); 567 let (ctx, optimized_insts) = guard.get(); 568 569 crate::opts::generated_code::constructor_simplify_skeleton( 570 &mut IsleContext { ctx }, 571 inst, 572 optimized_insts, 573 ); 574 575 let simplifications_len = optimized_insts.len(); 576 log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)"); 577 if simplifications_len > MATCHES_LIMIT { 578 log::trace!(" too many candidate simplifications; truncating to {MATCHES_LIMIT}"); 579 optimized_insts.truncate(MATCHES_LIMIT); 580 } 581 582 // Find the best simplification, if any, from our candidates. 583 // 584 // Unlike simplifying pure values, we do not add side-effectful 585 // instructions to the egraph, nor do we extract the best version via 586 // dynamic programming and considering the costs of operands. Instead, 587 // we greedily choose the best simplification. This is because there is 588 // an impedance mismatch: the egraph and our pure rewrites are centered 589 // around *values*, but we don't represent side-effects with values, we 590 // represent them implicitly in their *instructions*. 591 // 592 // The initial best choice is "no simplification, just use the original 593 // instruction" which has the original instruction's cost. 594 let mut best = None; 595 let mut best_cost = cost::Cost::of_skeleton_op( 596 ctx.func.dfg.insts[inst].opcode(), 597 ctx.func.dfg.inst_args(inst).len(), 598 ); 599 while let Some(simplification) = optimized_insts.pop() { 600 let (new_inst, new_val) = match simplification { 601 // We can't do better than completely removing the skeleton 602 // instruction, so short-cicuit the loop and eagerly return the 603 // `Remove*` simplifications. 604 SkeletonInstSimplification::Remove => { 605 log::trace!(" -> simplify_skeleton: remove inst"); 606 debug_assert!(ctx.func.dfg.inst_results(inst).is_empty()); 607 return Some(simplification); 608 } 609 SkeletonInstSimplification::RemoveWithVal { val } => { 610 log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result"); 611 if cfg!(debug_assertions) { 612 let results = ctx.func.dfg.inst_results(inst); 613 debug_assert_eq!(results.len(), 1); 614 debug_assert_eq!( 615 ctx.func.dfg.value_type(results[0]), 616 ctx.func.dfg.value_type(val), 617 ); 618 } 619 return Some(simplification); 620 } 621 622 // For instruction replacement simplification, we want to check 623 // that the replacements define the same number and types of 624 // values as the original instruction, and also determine 625 // whether they are actually an improvement over (i.e. have 626 // lower cost than) the original instruction. 627 SkeletonInstSimplification::Replace { inst } => { 628 log::trace!( 629 " -> simplify_skeleton: replace inst with {inst}: {}", 630 ctx.func.dfg.display_inst(inst) 631 ); 632 (inst, None) 633 } 634 SkeletonInstSimplification::ReplaceWithVal { inst, val } => { 635 log::trace!( 636 " -> simplify_skeleton: replace inst with {val} and {inst}: {}", 637 ctx.func.dfg.display_inst(inst) 638 ); 639 (inst, Some(val)) 640 } 641 }; 642 643 if cfg!(debug_assertions) { 644 let opcode = ctx.func.dfg.insts[inst].opcode(); 645 debug_assert!( 646 !(opcode.is_terminator() || opcode.is_branch()), 647 "simplifying control-flow instructions and terminators is not yet supported", 648 ); 649 650 let old_vals = ctx.func.dfg.inst_results(inst); 651 let new_vals = if let Some(val) = new_val.as_ref() { 652 core::slice::from_ref(val) 653 } else { 654 ctx.func.dfg.inst_results(new_inst) 655 }; 656 debug_assert_eq!( 657 old_vals.len(), 658 new_vals.len(), 659 "skeleton simplification should result in the same number of result values", 660 ); 661 662 for (old_val, new_val) in old_vals.iter().zip(new_vals) { 663 let old_ty = ctx.func.dfg.value_type(*old_val); 664 let new_ty = ctx.func.dfg.value_type(*new_val); 665 debug_assert_eq!( 666 old_ty, new_ty, 667 "skeleton simplification should result in values of the correct type", 668 ); 669 } 670 } 671 672 // Our best simplification is the one with the least cost. Update 673 // `best` if necessary. 674 let cost = cost::Cost::of_skeleton_op( 675 ctx.func.dfg.insts[new_inst].opcode(), 676 ctx.func.dfg.inst_args(new_inst).len(), 677 ); 678 if cost < best_cost { 679 best = Some(simplification); 680 best_cost = cost; 681 } 682 } 683 684 // Return the best simplification! 685 best 686 } 687 } 688 689 impl<'a> EgraphPass<'a> { 690 /// Create a new EgraphPass. new( func: &'a mut Function, domtree: &'a DominatorTree, loop_analysis: &'a LoopAnalysis, alias_analysis: &'a mut AliasAnalysis<'a>, ctrl_plane: &'a mut ControlPlane, ) -> Self691 pub fn new( 692 func: &'a mut Function, 693 domtree: &'a DominatorTree, 694 loop_analysis: &'a LoopAnalysis, 695 alias_analysis: &'a mut AliasAnalysis<'a>, 696 ctrl_plane: &'a mut ControlPlane, 697 ) -> Self { 698 Self { 699 func, 700 domtree, 701 loop_analysis, 702 alias_analysis, 703 ctrl_plane, 704 stats: Stats::default(), 705 remat_values: FxHashSet::default(), 706 } 707 } 708 709 /// Run the process. run(&mut self)710 pub fn run(&mut self) { 711 self.remove_pure_and_optimize(); 712 713 trace!("egraph built:\n{}\n", self.func.display()); 714 if cfg!(feature = "trace-log") { 715 for (value, def) in self.func.dfg.values_and_defs() { 716 trace!(" -> {} = {:?}", value, def); 717 match def { 718 ValueDef::Result(i, 0) => { 719 trace!(" -> {} = {:?}", i, self.func.dfg.insts[i]); 720 } 721 _ => {} 722 } 723 } 724 } 725 726 self.elaborate(); 727 728 log::trace!("stats: {:#?}", self.stats); 729 } 730 731 /// Remove pure nodes from the `Layout` of the function, ensuring 732 /// that only the "side-effect skeleton" remains, and also 733 /// optimize the pure nodes. This is the first step of 734 /// egraph-based processing and turns the pure CFG-based CLIF into 735 /// a CFG skeleton with a sea of (optimized) nodes tying it 736 /// together. 737 /// 738 /// As we walk through the code, we eagerly apply optimization 739 /// rules; at any given point we have a "latest version" of an 740 /// eclass of possible representations for a `Value` in the 741 /// original program, which is itself a `Value` at the root of a 742 /// union-tree. We keep a map from the original values to these 743 /// optimized values. When we encounter any instruction (pure or 744 /// side-effecting skeleton) we rewrite its arguments to capture 745 /// the "latest" optimized forms of these values. (We need to do 746 /// this as part of this pass, and not later using a finished map, 747 /// because the eclass can continue to be updated and we need to 748 /// only refer to its subset that exists at this stage, to 749 /// maintain acyclicity.) remove_pure_and_optimize(&mut self)750 fn remove_pure_and_optimize(&mut self) { 751 let mut cursor = FuncCursor::new(self.func); 752 let mut value_to_opt_value: SecondaryMap<Value, Value> = 753 SecondaryMap::with_default(Value::reserved_value()); 754 755 // Map from instruction to value for hash-consing of pure ops 756 // into the egraph. This can be a standard (non-scoped) 757 // hashmap because pure ops have no location: they are 758 // "outside of" control flow. 759 // 760 // Note also that we keep the controlling typevar (the `Type` 761 // in the tuple below) because it may disambiguate 762 // instructions that are identical except for type. 763 // 764 // We store both skeleton and non-skeleton instructions in the 765 // GVN map; for skeleton instructions, we only store those 766 // that are idempotent, i.e., still eligible to GVN. Note that 767 // some skeleton instructions are idempotent but do not 768 // produce a value: e.g., traps on a given condition. To allow 769 // for both cases, we store an `Option<Value>` as the value in 770 // this map. 771 let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> = 772 ScopedHashMap::with_capacity(cursor.func.dfg.num_values()); 773 774 // The block in the domtree preorder traversal at each level 775 // of the GVN map. 776 let mut gvn_map_blocks: Vec<Block> = vec![]; 777 778 // To get the best possible merging and canonicalization, we 779 // track where a value is "available" at: this is the 780 // domtree-nearest-ancestor join of all args if the value 781 // itself is pure, otherwise the block where the value is 782 // defined. (And for union nodes, the 783 // domtree-highest-ancestor, i.e., the meet or the dual to the 784 // above join.) 785 let mut available_block: SecondaryMap<Value, Block> = 786 SecondaryMap::with_default(Block::reserved_value()); 787 788 // To avoid blowing up eclasses too much, we track the size of 789 // each eclass reachable by a tree of union nodes from a given 790 // value ID, and we avoid union'ing additional values into an 791 // eclass when it reaches `ECLASS_ENODE_LIMIT`. 792 // 793 // For efficiency, this encodes size minus one: so a value of 794 // zero (which is cheap to bulk-initialize) means a singleton 795 // eclass of size one. This also allows us to avoid explicitly 796 // writing the size for any values that are not union nodes. 797 let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0); 798 799 // This is an initial guess at the size we'll need, but we add 800 // more values as we build simplified alternative expressions so 801 // this is likely to realloc again later. 802 available_block.resize(cursor.func.dfg.num_values()); 803 804 // In domtree preorder, visit blocks. (TODO: factor out an 805 // iterator from this and elaborator.) 806 let root = cursor.layout().entry_block().unwrap(); 807 enum StackEntry { 808 Visit(Block), 809 Pop, 810 } 811 let mut block_stack = vec![StackEntry::Visit(root)]; 812 while let Some(entry) = block_stack.pop() { 813 match entry { 814 StackEntry::Visit(block) => { 815 // We popped this block; push children 816 // immediately, then process this block. 817 block_stack.push(StackEntry::Pop); 818 block_stack.extend( 819 self.ctrl_plane 820 .shuffled(self.domtree.children(block)) 821 .map(StackEntry::Visit), 822 ); 823 gvn_map.increment_depth(); 824 gvn_map_blocks.push(block); 825 826 trace!("Processing block {}", block); 827 cursor.set_position(CursorPosition::Before(block)); 828 829 let mut alias_analysis_state = self.alias_analysis.block_starting_state(block); 830 831 for ¶m in cursor.func.dfg.block_params(block) { 832 trace!("creating initial singleton eclass for blockparam {}", param); 833 value_to_opt_value[param] = param; 834 available_block[param] = block; 835 } 836 while let Some(inst) = cursor.next_inst() { 837 trace!( 838 "Processing inst {inst}: {}", 839 cursor.func.dfg.display_inst(inst), 840 ); 841 842 // Rewrite args of *all* instructions using the 843 // value-to-opt-value map. 844 cursor.func.dfg.map_inst_values(inst, |arg| { 845 let new_value = value_to_opt_value[arg]; 846 trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value); 847 debug_assert_ne!(new_value, Value::reserved_value()); 848 new_value 849 }); 850 851 // Build a context for optimization, with borrows of 852 // state. We can't invoke a method on `self` because 853 // we've borrowed `self.func` mutably (as 854 // `cursor.func`) so we pull apart the pieces instead 855 // here. 856 let mut ctx = OptimizeCtx { 857 func: cursor.func, 858 value_to_opt_value: &mut value_to_opt_value, 859 gvn_map: &mut gvn_map, 860 gvn_map_blocks: &mut gvn_map_blocks, 861 available_block: &mut available_block, 862 eclass_size: &mut eclass_size, 863 rewrite_depth: 0, 864 subsume_values: FxHashSet::default(), 865 remat_values: &mut self.remat_values, 866 stats: &mut self.stats, 867 domtree: &self.domtree, 868 alias_analysis: self.alias_analysis, 869 alias_analysis_state: &mut alias_analysis_state, 870 ctrl_plane: self.ctrl_plane, 871 optimized_values: Default::default(), 872 optimized_insts: Default::default(), 873 }; 874 875 if is_pure_for_egraph(ctx.func, inst) { 876 // Insert into GVN map and optimize any new nodes 877 // inserted (recursively performing this work for 878 // any nodes the optimization rules produce). 879 let inst = NewOrExistingInst::Existing(inst); 880 ctx.insert_pure_enode(inst); 881 // We've now rewritten all uses, or will when we 882 // see them, and the instruction exists as a pure 883 // enode in the eclass, so we can remove it. 884 cursor.remove_inst_and_step_back(); 885 } else { 886 if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) { 887 Self::execute_skeleton_inst_simplification( 888 cmd, 889 &mut cursor, 890 &mut value_to_opt_value, 891 inst, 892 ); 893 } 894 } 895 } 896 } 897 StackEntry::Pop => { 898 gvn_map.decrement_depth(); 899 gvn_map_blocks.pop(); 900 } 901 } 902 } 903 } 904 905 /// Execute a simplification of an instruction in the side-effectful 906 /// skeleton. execute_skeleton_inst_simplification( simplification: SkeletonInstSimplification, cursor: &mut FuncCursor, value_to_opt_value: &mut SecondaryMap<Value, Value>, old_inst: Inst, )907 fn execute_skeleton_inst_simplification( 908 simplification: SkeletonInstSimplification, 909 cursor: &mut FuncCursor, 910 value_to_opt_value: &mut SecondaryMap<Value, Value>, 911 old_inst: Inst, 912 ) { 913 let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| { 914 cursor.func.dfg.change_to_alias(old_val, new_val); 915 value_to_opt_value[old_val] = new_val; 916 }; 917 918 let (new_inst, new_val) = match simplification { 919 SkeletonInstSimplification::Remove => { 920 cursor.remove_inst_and_step_back(); 921 return; 922 } 923 SkeletonInstSimplification::RemoveWithVal { val } => { 924 cursor.remove_inst_and_step_back(); 925 let old_val = cursor.func.dfg.first_result(old_inst); 926 cursor.func.dfg.detach_inst_results(old_inst); 927 forward_val(cursor, old_val, val); 928 return; 929 } 930 SkeletonInstSimplification::Replace { inst } => (inst, None), 931 SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)), 932 }; 933 934 // Replace the old instruction with the new one. 935 cursor.replace_inst(new_inst); 936 debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator()); 937 938 // Redirect the old instruction's result values to our new instruction's 939 // result values. 940 let mut i = 0; 941 let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value { 942 if let Some(val) = new_val { 943 val 944 } else { 945 let val = dfg.inst_results(new_inst)[i]; 946 i += 1; 947 val 948 } 949 }; 950 for i in 0..cursor.func.dfg.inst_results(old_inst).len() { 951 let old_val = cursor.func.dfg.inst_results(old_inst)[i]; 952 let new_val = next_new_val(&cursor.func.dfg); 953 forward_val(cursor, old_val, new_val); 954 } 955 956 // Back up so that the next iteration of the outer egraph loop will 957 // process the new instruction. 958 cursor.goto_inst(new_inst); 959 cursor.prev_inst(); 960 } 961 962 /// Scoped elaboration: compute a final ordering of op computation 963 /// for each block and update the given Func body. After this 964 /// runs, the function body is back into the state where every 965 /// Inst with an used result is placed in the layout (possibly 966 /// duplicated, if our code-motion logic decides this is the best 967 /// option). 968 /// 969 /// This works in concert with the domtree. We do a preorder 970 /// traversal of the domtree, tracking a scoped map from Id to 971 /// (new) Value. The map's scopes correspond to levels in the 972 /// domtree. 973 /// 974 /// At each block, we iterate forward over the side-effecting 975 /// eclasses, and recursively generate their arg eclasses, then 976 /// emit the ops themselves. 977 /// 978 /// To use an eclass in a given block, we first look it up in the 979 /// scoped map, and get the Value if already present. If not, we 980 /// need to generate it. We emit the extracted enode for this 981 /// eclass after recursively generating its args. Eclasses are 982 /// thus computed "as late as possible", but then memoized into 983 /// the Id-to-Value map and available to all dominated blocks and 984 /// for the rest of this block. (This subsumes GVN.) elaborate(&mut self)985 fn elaborate(&mut self) { 986 let mut elaborator = Elaborator::new( 987 self.func, 988 &self.domtree, 989 self.loop_analysis, 990 &self.remat_values, 991 &mut self.stats, 992 self.ctrl_plane, 993 ); 994 elaborator.elaborate(); 995 996 self.check_post_egraph(); 997 } 998 999 #[cfg(debug_assertions)] check_post_egraph(&self)1000 fn check_post_egraph(&self) { 1001 // Verify that no union nodes are reachable from inst args, 1002 // and that all inst args' defining instructions are in the 1003 // layout. 1004 for block in self.func.layout.blocks() { 1005 for inst in self.func.layout.block_insts(block) { 1006 self.func 1007 .dfg 1008 .inst_values(inst) 1009 .for_each(|arg| match self.func.dfg.value_def(arg) { 1010 ValueDef::Result(i, _) => { 1011 debug_assert!(self.func.layout.inst_block(i).is_some()); 1012 } 1013 ValueDef::Union(..) => { 1014 panic!("egraph union node {arg} still reachable at {inst}!"); 1015 } 1016 _ => {} 1017 }) 1018 } 1019 } 1020 } 1021 1022 #[cfg(not(debug_assertions))] check_post_egraph(&self)1023 fn check_post_egraph(&self) {} 1024 } 1025 1026 /// Implementation of external-context equality and hashing on 1027 /// InstructionData. This allows us to deduplicate instructions given 1028 /// some context that lets us see its value lists, so we don't need to 1029 /// store arguments inline in the `InstructionData` (or alongside it in 1030 /// some newly-defined key type) in all cases. 1031 struct GVNContext<'a> { 1032 value_lists: &'a ValueListPool, 1033 } 1034 1035 impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> { ctx_eq( &self, (a_ty, a_inst): &(Type, InstructionData), (b_ty, b_inst): &(Type, InstructionData), ) -> bool1036 fn ctx_eq( 1037 &self, 1038 (a_ty, a_inst): &(Type, InstructionData), 1039 (b_ty, b_inst): &(Type, InstructionData), 1040 ) -> bool { 1041 a_ty == b_ty && a_inst.eq(b_inst, self.value_lists) 1042 } 1043 } 1044 1045 impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> { ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData))1046 fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) { 1047 core::hash::Hash::hash(&ty, state); 1048 inst.hash(state, self.value_lists); 1049 } 1050 } 1051 1052 /// Statistics collected during egraph-based processing. 1053 #[derive(Clone, Debug, Default)] 1054 pub(crate) struct Stats { 1055 pub(crate) pure_inst: u64, 1056 pub(crate) pure_inst_deduped: u64, 1057 pub(crate) pure_inst_subsume: u64, 1058 pub(crate) pure_inst_rewrite_to_self: u64, 1059 pub(crate) pure_inst_insert_orig: u64, 1060 pub(crate) pure_inst_insert_new: u64, 1061 pub(crate) skeleton_inst: u64, 1062 pub(crate) skeleton_inst_simplified: u64, 1063 pub(crate) skeleton_inst_gvn: u64, 1064 pub(crate) alias_analysis_removed: u64, 1065 pub(crate) new_inst: u64, 1066 pub(crate) union: u64, 1067 pub(crate) subsume: u64, 1068 pub(crate) remat: u64, 1069 pub(crate) rewrite_rule_invoked: u64, 1070 pub(crate) rewrite_rule_results: u64, 1071 pub(crate) rewrite_depth_limit: u64, 1072 pub(crate) elaborate_visit_node: u64, 1073 pub(crate) elaborate_memoize_hit: u64, 1074 pub(crate) elaborate_memoize_miss: u64, 1075 pub(crate) elaborate_remat: u64, 1076 pub(crate) elaborate_licm_hoist: u64, 1077 pub(crate) elaborate_func: u64, 1078 pub(crate) elaborate_func_pre_insts: u64, 1079 pub(crate) elaborate_func_post_insts: u64, 1080 pub(crate) eclass_size_limit: u64, 1081 } 1082