1 //! Support for egraphs represented in the DataFlowGraph. 2 3 use crate::FxHashSet; 4 use crate::alias_analysis::{AliasAnalysis, LastStores}; 5 use crate::ctxhash::{CtxEq, CtxHash, NullCtx}; 6 use crate::cursor::{Cursor, CursorPosition, FuncCursor}; 7 use crate::dominator_tree::DominatorTree; 8 use crate::egraph::elaborate::Elaborator; 9 use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph}; 10 use crate::ir::pcc::Fact; 11 use crate::ir::{ 12 Block, DataFlowGraph, Function, Inst, InstructionData, Opcode, Type, Value, ValueDef, 13 ValueListPool, 14 }; 15 use crate::loop_analysis::LoopAnalysis; 16 use crate::opts::IsleContext; 17 use crate::opts::generated_code::SkeletonInstSimplification; 18 use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap}; 19 use crate::settings::Flags; 20 use crate::take_and_replace::TakeAndReplace; 21 use crate::trace; 22 use alloc::vec::Vec; 23 use core::cmp::Ordering; 24 use core::hash::Hasher; 25 use cranelift_control::ControlPlane; 26 use cranelift_entity::SecondaryMap; 27 use cranelift_entity::packed_option::ReservedValue; 28 use smallvec::SmallVec; 29 30 mod cost; 31 mod elaborate; 32 33 /// Pass over a Function that does the whole aegraph thing. 34 /// 35 /// - Removes non-skeleton nodes from the Layout. 36 /// - Performs a GVN-and-rule-application pass over all Values 37 /// reachable from the skeleton, potentially creating new Union 38 /// nodes (i.e., an aegraph) so that some values have multiple 39 /// representations. 40 /// - Does "extraction" on the aegraph: selects the best value out of 41 /// the tree-of-Union nodes for each used value. 42 /// - Does "scoped elaboration" on the aegraph: chooses one or more 43 /// locations for pure nodes to become instructions again in the 44 /// layout, as forced by the skeleton. 45 /// 46 /// At the beginning and end of this pass, the CLIF should be in a 47 /// state that passes the verifier and, additionally, has no Union 48 /// nodes. During the pass, Union nodes may exist, and instructions in 49 /// the layout may refer to results of instructions that are not 50 /// placed in the layout. 51 pub struct EgraphPass<'a> { 52 /// The function we're operating on. 53 func: &'a mut Function, 54 /// Dominator tree for the CFG, used to visit blocks in pre-order 55 /// so we see value definitions before their uses, and also used for 56 /// O(1) dominance checks. 57 domtree: &'a DominatorTree, 58 /// Alias analysis, used during optimization. 59 alias_analysis: &'a mut AliasAnalysis<'a>, 60 /// Loop analysis results, used for built-in LICM during 61 /// elaboration. 62 loop_analysis: &'a LoopAnalysis, 63 /// Compiler flags. 64 flags: &'a Flags, 65 /// Chaos-mode control-plane so we can test that we still get 66 /// correct results when our heuristics make bad decisions. 67 ctrl_plane: &'a mut ControlPlane, 68 /// Which Values do we want to rematerialize in each block where 69 /// they're used? 70 remat_values: FxHashSet<Value>, 71 /// Stats collected while we run this pass. 72 pub(crate) stats: Stats, 73 } 74 75 /// The maximum number of rewrites we will take from a single call into ISLE. 76 const MATCHES_LIMIT: usize = 5; 77 78 /// The maximum number of enodes in any given eclass. 79 const ECLASS_ENODE_LIMIT: usize = 5; 80 81 /// Context passed through node insertion and optimization. 82 pub(crate) struct OptimizeCtx<'opt, 'analysis> 83 where 84 'analysis: 'opt, 85 { 86 // Borrowed from EgraphPass: 87 pub(crate) func: &'opt mut Function, 88 pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>, 89 available_block: &'opt mut SecondaryMap<Value, Block>, 90 eclass_size: &'opt mut SecondaryMap<Value, u8>, 91 pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>, 92 pub(crate) gvn_map_blocks: &'opt Vec<Block>, 93 pub(crate) remat_values: &'opt mut FxHashSet<Value>, 94 pub(crate) stats: &'opt mut Stats, 95 domtree: &'opt DominatorTree, 96 pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>, 97 pub(crate) alias_analysis_state: &'opt mut LastStores, 98 flags: &'opt Flags, 99 ctrl_plane: &'opt mut ControlPlane, 100 // Held locally during optimization of one node (recursively): 101 pub(crate) rewrite_depth: usize, 102 pub(crate) subsume_values: FxHashSet<Value>, 103 optimized_values: SmallVec<[Value; MATCHES_LIMIT]>, 104 optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>, 105 } 106 107 /// For passing to `insert_pure_enode`. Sometimes the enode already 108 /// exists as an Inst (from the original CLIF), and sometimes we're in 109 /// the middle of creating it and want to avoid inserting it if 110 /// possible until we know we need it. 111 pub(crate) enum NewOrExistingInst { 112 New(InstructionData, Type), 113 Existing(Inst), 114 } 115 116 impl NewOrExistingInst { 117 fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) { 118 match self { 119 NewOrExistingInst::New(data, ty) => (*ty, *data), 120 NewOrExistingInst::Existing(inst) => { 121 let ty = dfg.ctrl_typevar(*inst); 122 (ty, dfg.insts[*inst]) 123 } 124 } 125 } 126 } 127 128 impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis> 129 where 130 'analysis: 'opt, 131 { 132 /// Optimization of a single instruction. 133 /// 134 /// This does a few things: 135 /// - Looks up the instruction in the GVN deduplication map. If we 136 /// already have the same instruction somewhere else, with the 137 /// same args, then we can alias the original instruction's 138 /// results and omit this instruction entirely. 139 /// - If the instruction is "new" (not deduplicated), then apply 140 /// optimization rules: 141 /// - All of the mid-end rules written in ISLE. 142 /// - Store-to-load forwarding. 143 /// - Update the value-to-opt-value map, and update the eclass 144 /// union-find, if we rewrote the value to different form(s). 145 pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value { 146 // Create the external context for looking up and updating the 147 // GVN map. This is necessary so that instructions themselves 148 // do not have to carry all the references or data for a full 149 // `Eq` or `Hash` impl. 150 let gvn_context = GVNContext { 151 value_lists: &self.func.dfg.value_lists, 152 }; 153 154 self.stats.pure_inst += 1; 155 if let NewOrExistingInst::New(..) = inst { 156 self.stats.new_inst += 1; 157 } 158 159 // Does this instruction already exist? If so, add entries to 160 // the value-map to rewrite uses of its results to the results 161 // of the original (existing) instruction. If not, optimize 162 // the new instruction. 163 if let Some(&Some(orig_result)) = self 164 .gvn_map 165 .get(&gvn_context, &inst.get_inst_key(&self.func.dfg)) 166 { 167 self.stats.pure_inst_deduped += 1; 168 if let NewOrExistingInst::Existing(inst) = inst { 169 debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1); 170 let result = self.func.dfg.first_result(inst); 171 self.value_to_opt_value[result] = orig_result; 172 self.available_block[result] = self.available_block[orig_result]; 173 self.func.dfg.merge_facts(result, orig_result); 174 } 175 orig_result 176 } else { 177 // Now actually insert the InstructionData and attach 178 // result value (exactly one). 179 let (inst, result, ty) = match inst { 180 NewOrExistingInst::New(data, typevar) => { 181 self.stats.pure_inst_insert_new += 1; 182 let inst = self.func.dfg.make_inst(data); 183 // TODO: reuse return value? 184 self.func.dfg.make_inst_results(inst, typevar); 185 let result = self.func.dfg.first_result(inst); 186 // New inst. We need to do the analysis of its result. 187 (inst, result, typevar) 188 } 189 NewOrExistingInst::Existing(inst) => { 190 self.stats.pure_inst_insert_orig += 1; 191 let result = self.func.dfg.first_result(inst); 192 let ty = self.func.dfg.ctrl_typevar(inst); 193 (inst, result, ty) 194 } 195 }; 196 197 self.attach_constant_fact(inst, result, ty); 198 199 self.available_block[result] = self.get_available_block(inst); 200 let opt_value = self.optimize_pure_enode(inst); 201 log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}"); 202 203 let gvn_context = GVNContext { 204 value_lists: &self.func.dfg.value_lists, 205 }; 206 // Insert at level implied by args. This enables merging 207 // in LICM cases like: 208 // 209 // while (...) { 210 // if (...) { 211 // let x = loop_invariant_expr; 212 // } 213 // if (...) { 214 // let x = loop_invariant_expr; 215 // } 216 // } 217 // 218 // where the two instances of the expression otherwise 219 // wouldn't merge because each would be in a separate 220 // subscope of the scoped hashmap during traversal. 221 log::trace!( 222 "value {} is available at {}", 223 opt_value, 224 self.available_block[opt_value] 225 ); 226 let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]); 227 self.gvn_map.insert_with_depth( 228 &gvn_context, 229 (ty, self.func.dfg.insts[inst]), 230 Some(opt_value), 231 depth, 232 ); 233 self.value_to_opt_value[result] = opt_value; 234 opt_value 235 } 236 } 237 238 /// Find the block where a pure instruction first becomes available, 239 /// defined as the block that is closest to the root where all of 240 /// its arguments are available. In the unusual case where a pure 241 /// instruction has no arguments (e.g. get_return_address), we can 242 /// place it anywhere, so it is available in the entry block. 243 /// 244 /// This function does not compute available blocks recursively. 245 /// All of the instruction's arguments must have had their available 246 /// blocks assigned already. 247 fn get_available_block(&self, inst: Inst) -> Block { 248 // Side-effecting instructions have different rules for where 249 // they become available, so this function does not apply. 250 debug_assert!(is_pure_for_egraph(self.func, inst)); 251 252 // Note that the def-point of all arguments to an instruction 253 // in SSA lie on a line of direct ancestors in the domtree, and 254 // so do their available-blocks. This means that for any pair of 255 // arguments, their available blocks are either the same or one 256 // strictly dominates the other. We just need to find any argument 257 // whose available block is deepest in the domtree. 258 self.func.dfg.insts[inst] 259 .arguments(&self.func.dfg.value_lists) 260 .iter() 261 .map(|&v| { 262 let block = self.available_block[v]; 263 debug_assert!(!block.is_reserved_value()); 264 block 265 }) 266 .max_by(|&x, &y| { 267 if self.domtree.block_dominates(x, y) { 268 Ordering::Less 269 } else { 270 debug_assert!(self.domtree.block_dominates(y, x)); 271 Ordering::Greater 272 } 273 }) 274 .unwrap_or(self.func.layout.entry_block().unwrap()) 275 } 276 277 fn depth_of_block_in_gvn_map(&self, block: Block) -> usize { 278 log::trace!( 279 "finding depth of available block {} in domtree stack: {:?}", 280 block, 281 self.gvn_map_blocks 282 ); 283 self.gvn_map_blocks 284 .iter() 285 .enumerate() 286 .rev() 287 .find(|&(_, b)| *b == block) 288 .unwrap() 289 .0 290 } 291 292 /// Optimizes an enode by applying any matching mid-end rewrite 293 /// rules (or store-to-load forwarding, which is a special case), 294 /// unioning together all possible optimized (or rewritten) forms 295 /// of this expression into an eclass and returning the `Value` 296 /// that represents that eclass. 297 fn optimize_pure_enode(&mut self, inst: Inst) -> Value { 298 // A pure node always has exactly one result. 299 let orig_value = self.func.dfg.first_result(inst); 300 301 let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values); 302 let (ctx, optimized_values) = guard.get(); 303 304 // Limit rewrite depth. When we apply optimization rules, they 305 // may create new nodes (values) and those are, recursively, 306 // optimized eagerly as soon as they are created. So we may 307 // have more than one ISLE invocation on the stack. (This is 308 // necessary so that as the toplevel builds the 309 // right-hand-side expression bottom-up, it uses the "latest" 310 // optimized values for all the constituent parts.) To avoid 311 // infinite or problematic recursion, we bound the rewrite 312 // depth to a small constant here. 313 const REWRITE_LIMIT: usize = 5; 314 if ctx.rewrite_depth > REWRITE_LIMIT { 315 ctx.stats.rewrite_depth_limit += 1; 316 return orig_value; 317 } 318 ctx.rewrite_depth += 1; 319 trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth); 320 321 // Invoke the ISLE toplevel constructor, getting all new 322 // values produced as equivalents to this value. 323 trace!("Calling into ISLE with original value {}", orig_value); 324 ctx.stats.rewrite_rule_invoked += 1; 325 debug_assert!(optimized_values.is_empty()); 326 crate::opts::generated_code::constructor_simplify( 327 &mut IsleContext { ctx }, 328 orig_value, 329 optimized_values, 330 ); 331 332 ctx.stats.rewrite_rule_results += optimized_values.len() as u64; 333 334 // It's not supposed to matter what order `simplify` returns values in. 335 ctx.ctrl_plane.shuffle(optimized_values); 336 337 let num_matches = optimized_values.len(); 338 if num_matches > MATCHES_LIMIT { 339 trace!( 340 "Reached maximum matches limit; too many optimized values \ 341 ({num_matches} > {MATCHES_LIMIT}); ignoring rest.", 342 ); 343 optimized_values.truncate(MATCHES_LIMIT); 344 } 345 346 // Sort and deduplicate optimized values, in case multiple 347 // rules produced the same simplification. 348 optimized_values.sort_unstable(); 349 optimized_values.dedup(); 350 351 trace!(" -> returned from ISLE: {orig_value} -> {optimized_values:?}"); 352 353 // Construct a union-node tree representing the new eclass 354 // that results from rewriting. If any returned value was 355 // marked "subsume", take only that value. Otherwise, 356 // sequentially build the chain over the original value and 357 // all returned values. 358 let result_value = if let Some(&subsuming_value) = optimized_values 359 .iter() 360 .find(|&value| ctx.subsume_values.contains(value)) 361 { 362 optimized_values.clear(); 363 ctx.stats.pure_inst_subsume += 1; 364 subsuming_value 365 } else { 366 let mut union_value = orig_value; 367 let mut eclass_size = ctx.eclass_size[orig_value] + 1; 368 for optimized_value in optimized_values.drain(..) { 369 trace!( 370 "Returned from ISLE for {}, got {:?}", 371 orig_value, optimized_value 372 ); 373 if optimized_value == orig_value { 374 trace!(" -> same as orig value; skipping"); 375 ctx.stats.pure_inst_rewrite_to_self += 1; 376 continue; 377 } 378 let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1; 379 if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT { 380 trace!(" -> reached eclass size limit"); 381 ctx.stats.eclass_size_limit += 1; 382 break; 383 } 384 let old_union_value = union_value; 385 union_value = ctx.func.dfg.union(old_union_value, optimized_value); 386 eclass_size += rhs_eclass_size; 387 ctx.eclass_size[union_value] = eclass_size - 1; 388 ctx.stats.union += 1; 389 trace!(" -> union: now {}", union_value); 390 ctx.func.dfg.merge_facts(old_union_value, optimized_value); 391 ctx.available_block[union_value] = 392 ctx.merge_availability(old_union_value, optimized_value); 393 } 394 union_value 395 }; 396 397 ctx.rewrite_depth -= 1; 398 trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth); 399 if ctx.rewrite_depth == 0 { 400 ctx.subsume_values.clear(); 401 } 402 403 debug_assert!(ctx.optimized_values.is_empty()); 404 result_value 405 } 406 407 fn merge_availability(&self, a: Value, b: Value) -> Block { 408 let a = self.available_block[a]; 409 let b = self.available_block[b]; 410 if self.domtree.block_dominates(a, b) { 411 a 412 } else { 413 b 414 } 415 } 416 417 /// Optimize a "skeleton" instruction. 418 /// 419 /// Returns an optional command of how to continue processing the optimized 420 /// instruction (e.g. removing it or replacing it with a new instruction). 421 fn optimize_skeleton_inst( 422 &mut self, 423 inst: Inst, 424 block: Block, 425 ) -> Option<SkeletonInstSimplification> { 426 self.stats.skeleton_inst += 1; 427 428 // If we have a rewrite rule for this instruction, do that first, so 429 // that GVN and alias analysis only see simplified skeleton 430 // instructions. 431 if let Some(cmd) = self.simplify_skeleton_inst(inst) { 432 self.stats.skeleton_inst_simplified += 1; 433 return Some(cmd); 434 } 435 436 // First, can we try to deduplicate? We need to keep some copy 437 // of the instruction around because it's side-effecting, but 438 // we may be able to reuse an earlier instance of it. 439 if is_mergeable_for_egraph(self.func, inst) { 440 let result = self.func.dfg.inst_results(inst).get(0).copied(); 441 trace!(" -> mergeable side-effecting op {}", inst); 442 443 // Does this instruction already exist? If so, add entries to 444 // the value-map to rewrite uses of its results to the results 445 // of the original (existing) instruction. If not, optimize 446 // the new instruction. 447 // 448 // Note that the GVN map is scoped, which is important 449 // here: because effectful ops are not removed from the 450 // skeleton (`Layout`), we need to be mindful of whether 451 // our current position is dominated by an instance of the 452 // instruction. (See #5796 for details.) 453 let ty = self.func.dfg.ctrl_typevar(inst); 454 match self 455 .gvn_map 456 .entry(&NullCtx, (ty, self.func.dfg.insts[inst])) 457 { 458 ScopedEntry::Occupied(o) => { 459 let orig_result = *o.get(); 460 match (result, orig_result) { 461 (Some(result), Some(orig_result)) => { 462 // Hit in GVN map -- reuse value. 463 self.stats.skeleton_inst_gvn += 1; 464 self.value_to_opt_value[result] = orig_result; 465 self.available_block[result] = self.available_block[orig_result]; 466 trace!(" -> merges result {} to {}", result, orig_result); 467 } 468 (None, None) => { 469 // Hit in the GVN map, but the instruction doesn't 470 // produce results, only side effects. Nothing else 471 // to do here. 472 self.stats.skeleton_inst_gvn += 1; 473 trace!(" -> merges with dominating instruction"); 474 } 475 (_, _) => unreachable!(), 476 } 477 Some(SkeletonInstSimplification::Remove) 478 } 479 ScopedEntry::Vacant(v) => { 480 // Otherwise, insert it into the value-map. 481 if let Some(result) = result { 482 self.value_to_opt_value[result] = result; 483 self.available_block[result] = block; 484 } 485 v.insert(result); 486 trace!(" -> inserts as new (no GVN)"); 487 None 488 } 489 } 490 } 491 // Otherwise, if a load or store, process it with the alias 492 // analysis to see if we can optimize it (rewrite in terms of 493 // an earlier load or stored value). 494 else if let Some(new_result) = 495 self.alias_analysis 496 .process_inst(self.func, self.alias_analysis_state, inst) 497 { 498 self.stats.alias_analysis_removed += 1; 499 let result = self.func.dfg.first_result(inst); 500 trace!( 501 " -> inst {} has result {} replaced with {}", 502 inst, result, new_result 503 ); 504 self.value_to_opt_value[result] = new_result; 505 self.available_block[result] = self.available_block[new_result]; 506 self.func.dfg.merge_facts(result, new_result); 507 Some(SkeletonInstSimplification::Remove) 508 } 509 // Otherwise, generic side-effecting op -- always keep it, and 510 // set its results to identity-map to original values. 511 else { 512 // Set all results to identity-map to themselves 513 // in the value-to-opt-value map. 514 for &result in self.func.dfg.inst_results(inst) { 515 self.value_to_opt_value[result] = result; 516 self.available_block[result] = block; 517 } 518 None 519 } 520 } 521 522 /// Find the best simplification of the given skeleton instruction, if any, 523 /// by consulting our `simplify_skeleton` ISLE rules. 524 fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> { 525 // We cannot currently simplify terminators, or simplify into 526 // terminators. Anything that could change the control-flow graph is off 527 // limits. 528 // 529 // Consider the following CLIF snippet: 530 // 531 // block0(v0: i64): 532 // v1 = iconst.i32 0 533 // trapz v1, user42 534 // v2 = load.i32 v0 535 // brif v1, block1, block2 536 // block1: 537 // return v2 538 // block2: 539 // v3 = iconst.i32 1 540 // v4 = iadd v2, v3 541 // return v4 542 // 543 // We would ideally like to perform simplifications like replacing the 544 // `trapz` with an unconditional `trap` and the conditional `brif` 545 // branch with an unconditional `jump`. Note, however, that blocks 546 // `block1` and `block2` are dominated by `block0` and therefore can and 547 // do use values defined in `block0`. This presents challenges: 548 // 549 // * If we replace the `brif` with a `jump`, then we've mutated the 550 // control-flow graph and removed that domination property. The uses 551 // of `v2` and `v3` in those blocks become invalid. 552 // 553 // * Even worse, if we turn the `trapz` into a `trap`, we are 554 // introducing a terminator into the middle of the block, which leaves 555 // us with two choices to fix up the IR so that there aren't any 556 // instructions following the terminator in the block: 557 // 558 // 1. We can split the unreachable instructions off into a new 559 // block. However, there is no control-flow edge from the current 560 // block to this new block and so, again, the new block isn't 561 // dominated by the current block, and therefore the can't use 562 // values defined in this block or any dominating it. The `load` 563 // instruction uses `v0` but is not dominated by `v0`'s 564 // definition. 565 // 566 // 2. Alternatively, we can simply delete the trailing instructions, 567 // since they are unreachable. But then not only are the old 568 // instructions' uses no longer dominated by their definitions, but 569 // the definitions do not exist at all anymore! 570 // 571 // Whatever approach we would take, we would invalidate value uses, and 572 // would need to track and fix them up. 573 if self.func.dfg.insts[inst].opcode().is_branch() { 574 return None; 575 } 576 577 let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts); 578 let (ctx, optimized_insts) = guard.get(); 579 580 crate::opts::generated_code::constructor_simplify_skeleton( 581 &mut IsleContext { ctx }, 582 inst, 583 optimized_insts, 584 ); 585 586 let simplifications_len = optimized_insts.len(); 587 log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)"); 588 if simplifications_len > MATCHES_LIMIT { 589 log::trace!(" too many candidate simplifications; truncating to {MATCHES_LIMIT}"); 590 optimized_insts.truncate(MATCHES_LIMIT); 591 } 592 593 // Find the best simplification, if any, from our candidates. 594 // 595 // Unlike simplifying pure values, we do not add side-effectful 596 // instructions to the egraph, nor do we extract the best version via 597 // dynamic programming and considering the costs of operands. Instead, 598 // we greedily choose the best simplification. This is because there is 599 // an impedance mismatch: the egraph and our pure rewrites are centered 600 // around *values*, but we don't represent side-effects with values, we 601 // represent them implicitly in their *instructions*. 602 // 603 // The initial best choice is "no simplification, just use the original 604 // instruction" which has the original instruction's cost. 605 let mut best = None; 606 let mut best_cost = cost::Cost::of_skeleton_op( 607 ctx.func.dfg.insts[inst].opcode(), 608 ctx.func.dfg.inst_args(inst).len(), 609 ); 610 while let Some(simplification) = optimized_insts.pop() { 611 let (new_inst, new_val) = match simplification { 612 // We can't do better than completely removing the skeleton 613 // instruction, so short-cicuit the loop and eagerly return the 614 // `Remove*` simplifications. 615 SkeletonInstSimplification::Remove => { 616 log::trace!(" -> simplify_skeleton: remove inst"); 617 debug_assert!(ctx.func.dfg.inst_results(inst).is_empty()); 618 return Some(simplification); 619 } 620 SkeletonInstSimplification::RemoveWithVal { val } => { 621 log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result"); 622 if cfg!(debug_assertions) { 623 let results = ctx.func.dfg.inst_results(inst); 624 debug_assert_eq!(results.len(), 1); 625 debug_assert_eq!( 626 ctx.func.dfg.value_type(results[0]), 627 ctx.func.dfg.value_type(val), 628 ); 629 } 630 return Some(simplification); 631 } 632 633 // For instruction replacement simplification, we want to check 634 // that the replacements define the same number and types of 635 // values as the original instruction, and also determine 636 // whether they are actually an improvement over (i.e. have 637 // lower cost than) the original instruction. 638 SkeletonInstSimplification::Replace { inst } => { 639 log::trace!( 640 " -> simplify_skeleton: replace inst with {inst}: {}", 641 ctx.func.dfg.display_inst(inst) 642 ); 643 (inst, None) 644 } 645 SkeletonInstSimplification::ReplaceWithVal { inst, val } => { 646 log::trace!( 647 " -> simplify_skeleton: replace inst with {val} and {inst}: {}", 648 ctx.func.dfg.display_inst(inst) 649 ); 650 (inst, Some(val)) 651 } 652 }; 653 654 if cfg!(debug_assertions) { 655 let opcode = ctx.func.dfg.insts[inst].opcode(); 656 debug_assert!( 657 !(opcode.is_terminator() || opcode.is_branch()), 658 "simplifying control-flow instructions and terminators is not yet supported", 659 ); 660 661 let old_vals = ctx.func.dfg.inst_results(inst); 662 let new_vals = if let Some(val) = new_val.as_ref() { 663 core::slice::from_ref(val) 664 } else { 665 ctx.func.dfg.inst_results(new_inst) 666 }; 667 debug_assert_eq!( 668 old_vals.len(), 669 new_vals.len(), 670 "skeleton simplification should result in the same number of result values", 671 ); 672 673 for (old_val, new_val) in old_vals.iter().zip(new_vals) { 674 let old_ty = ctx.func.dfg.value_type(*old_val); 675 let new_ty = ctx.func.dfg.value_type(*new_val); 676 debug_assert_eq!( 677 old_ty, new_ty, 678 "skeleton simplification should result in values of the correct type", 679 ); 680 } 681 } 682 683 // Our best simplification is the one with the least cost. Update 684 // `best` if necessary. 685 let cost = cost::Cost::of_skeleton_op( 686 ctx.func.dfg.insts[new_inst].opcode(), 687 ctx.func.dfg.inst_args(new_inst).len(), 688 ); 689 if cost < best_cost { 690 best = Some(simplification); 691 best_cost = cost; 692 } 693 } 694 695 // Return the best simplification! 696 best 697 } 698 699 /// Helper to propagate facts on constant values: if PCC is 700 /// enabled, then unconditionally add a fact attesting to the 701 /// Value's concrete value. 702 fn attach_constant_fact(&mut self, inst: Inst, value: Value, ty: Type) { 703 if self.flags.enable_pcc() { 704 if let InstructionData::UnaryImm { 705 opcode: Opcode::Iconst, 706 imm, 707 } = self.func.dfg.insts[inst] 708 { 709 let imm: i64 = imm.into(); 710 self.func.dfg.facts[value] = 711 Some(Fact::constant(ty.bits().try_into().unwrap(), imm as u64)); 712 } 713 } 714 } 715 } 716 717 impl<'a> EgraphPass<'a> { 718 /// Create a new EgraphPass. 719 pub fn new( 720 func: &'a mut Function, 721 domtree: &'a DominatorTree, 722 loop_analysis: &'a LoopAnalysis, 723 alias_analysis: &'a mut AliasAnalysis<'a>, 724 flags: &'a Flags, 725 ctrl_plane: &'a mut ControlPlane, 726 ) -> Self { 727 Self { 728 func, 729 domtree, 730 loop_analysis, 731 alias_analysis, 732 flags, 733 ctrl_plane, 734 stats: Stats::default(), 735 remat_values: FxHashSet::default(), 736 } 737 } 738 739 /// Run the process. 740 pub fn run(&mut self) { 741 self.remove_pure_and_optimize(); 742 743 trace!("egraph built:\n{}\n", self.func.display()); 744 if cfg!(feature = "trace-log") { 745 for (value, def) in self.func.dfg.values_and_defs() { 746 trace!(" -> {} = {:?}", value, def); 747 match def { 748 ValueDef::Result(i, 0) => { 749 trace!(" -> {} = {:?}", i, self.func.dfg.insts[i]); 750 } 751 _ => {} 752 } 753 } 754 } 755 756 self.elaborate(); 757 758 log::trace!("stats: {:#?}", self.stats); 759 } 760 761 /// Remove pure nodes from the `Layout` of the function, ensuring 762 /// that only the "side-effect skeleton" remains, and also 763 /// optimize the pure nodes. This is the first step of 764 /// egraph-based processing and turns the pure CFG-based CLIF into 765 /// a CFG skeleton with a sea of (optimized) nodes tying it 766 /// together. 767 /// 768 /// As we walk through the code, we eagerly apply optimization 769 /// rules; at any given point we have a "latest version" of an 770 /// eclass of possible representations for a `Value` in the 771 /// original program, which is itself a `Value` at the root of a 772 /// union-tree. We keep a map from the original values to these 773 /// optimized values. When we encounter any instruction (pure or 774 /// side-effecting skeleton) we rewrite its arguments to capture 775 /// the "latest" optimized forms of these values. (We need to do 776 /// this as part of this pass, and not later using a finished map, 777 /// because the eclass can continue to be updated and we need to 778 /// only refer to its subset that exists at this stage, to 779 /// maintain acyclicity.) 780 fn remove_pure_and_optimize(&mut self) { 781 let mut cursor = FuncCursor::new(self.func); 782 let mut value_to_opt_value: SecondaryMap<Value, Value> = 783 SecondaryMap::with_default(Value::reserved_value()); 784 785 // Map from instruction to value for hash-consing of pure ops 786 // into the egraph. This can be a standard (non-scoped) 787 // hashmap because pure ops have no location: they are 788 // "outside of" control flow. 789 // 790 // Note also that we keep the controlling typevar (the `Type` 791 // in the tuple below) because it may disambiguate 792 // instructions that are identical except for type. 793 // 794 // We store both skeleton and non-skeleton instructions in the 795 // GVN map; for skeleton instructions, we only store those 796 // that are idempotent, i.e., still eligible to GVN. Note that 797 // some skeleton instructions are idempotent but do not 798 // produce a value: e.g., traps on a given condition. To allow 799 // for both cases, we store an `Option<Value>` as the value in 800 // this map. 801 let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> = 802 ScopedHashMap::with_capacity(cursor.func.dfg.num_values()); 803 804 // The block in the domtree preorder traversal at each level 805 // of the GVN map. 806 let mut gvn_map_blocks: Vec<Block> = vec![]; 807 808 // To get the best possible merging and canonicalization, we 809 // track where a value is "available" at: this is the 810 // domtree-nearest-ancestor join of all args if the value 811 // itself is pure, otherwise the block where the value is 812 // defined. (And for union nodes, the 813 // domtree-highest-ancestor, i.e., the meet or the dual to the 814 // above join.) 815 let mut available_block: SecondaryMap<Value, Block> = 816 SecondaryMap::with_default(Block::reserved_value()); 817 818 // To avoid blowing up eclasses too much, we track the size of 819 // each eclass reachable by a tree of union nodes from a given 820 // value ID, and we avoid union'ing additional values into an 821 // eclass when it reaches `ECLASS_ENODE_LIMIT`. 822 // 823 // For efficiency, this encodes size minus one: so a value of 824 // zero (which is cheap to bulk-initialize) means a singleton 825 // eclass of size one. This also allows us to avoid explicitly 826 // writing the size for any values that are not union nodes. 827 let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0); 828 829 // This is an initial guess at the size we'll need, but we add 830 // more values as we build simplified alternative expressions so 831 // this is likely to realloc again later. 832 available_block.resize(cursor.func.dfg.num_values()); 833 834 // In domtree preorder, visit blocks. (TODO: factor out an 835 // iterator from this and elaborator.) 836 let root = cursor.layout().entry_block().unwrap(); 837 enum StackEntry { 838 Visit(Block), 839 Pop, 840 } 841 let mut block_stack = vec![StackEntry::Visit(root)]; 842 while let Some(entry) = block_stack.pop() { 843 match entry { 844 StackEntry::Visit(block) => { 845 // We popped this block; push children 846 // immediately, then process this block. 847 block_stack.push(StackEntry::Pop); 848 block_stack.extend( 849 self.ctrl_plane 850 .shuffled(self.domtree.children(block)) 851 .map(StackEntry::Visit), 852 ); 853 gvn_map.increment_depth(); 854 gvn_map_blocks.push(block); 855 856 trace!("Processing block {}", block); 857 cursor.set_position(CursorPosition::Before(block)); 858 859 let mut alias_analysis_state = self.alias_analysis.block_starting_state(block); 860 861 for ¶m in cursor.func.dfg.block_params(block) { 862 trace!("creating initial singleton eclass for blockparam {}", param); 863 value_to_opt_value[param] = param; 864 available_block[param] = block; 865 } 866 while let Some(inst) = cursor.next_inst() { 867 trace!( 868 "Processing inst {inst}: {}", 869 cursor.func.dfg.display_inst(inst), 870 ); 871 872 // Rewrite args of *all* instructions using the 873 // value-to-opt-value map. 874 cursor.func.dfg.map_inst_values(inst, |arg| { 875 let new_value = value_to_opt_value[arg]; 876 trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value); 877 debug_assert_ne!(new_value, Value::reserved_value()); 878 new_value 879 }); 880 881 // Build a context for optimization, with borrows of 882 // state. We can't invoke a method on `self` because 883 // we've borrowed `self.func` mutably (as 884 // `cursor.func`) so we pull apart the pieces instead 885 // here. 886 let mut ctx = OptimizeCtx { 887 func: cursor.func, 888 value_to_opt_value: &mut value_to_opt_value, 889 gvn_map: &mut gvn_map, 890 gvn_map_blocks: &mut gvn_map_blocks, 891 available_block: &mut available_block, 892 eclass_size: &mut eclass_size, 893 rewrite_depth: 0, 894 subsume_values: FxHashSet::default(), 895 remat_values: &mut self.remat_values, 896 stats: &mut self.stats, 897 domtree: &self.domtree, 898 alias_analysis: self.alias_analysis, 899 alias_analysis_state: &mut alias_analysis_state, 900 flags: self.flags, 901 ctrl_plane: self.ctrl_plane, 902 optimized_values: Default::default(), 903 optimized_insts: Default::default(), 904 }; 905 906 if is_pure_for_egraph(ctx.func, inst) { 907 // Insert into GVN map and optimize any new nodes 908 // inserted (recursively performing this work for 909 // any nodes the optimization rules produce). 910 let inst = NewOrExistingInst::Existing(inst); 911 ctx.insert_pure_enode(inst); 912 // We've now rewritten all uses, or will when we 913 // see them, and the instruction exists as a pure 914 // enode in the eclass, so we can remove it. 915 cursor.remove_inst_and_step_back(); 916 } else { 917 if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) { 918 Self::execute_skeleton_inst_simplification( 919 cmd, 920 &mut cursor, 921 &mut value_to_opt_value, 922 inst, 923 ); 924 } 925 } 926 } 927 } 928 StackEntry::Pop => { 929 gvn_map.decrement_depth(); 930 gvn_map_blocks.pop(); 931 } 932 } 933 } 934 } 935 936 /// Execute a simplification of an instruction in the side-effectful 937 /// skeleton. 938 fn execute_skeleton_inst_simplification( 939 simplification: SkeletonInstSimplification, 940 cursor: &mut FuncCursor, 941 value_to_opt_value: &mut SecondaryMap<Value, Value>, 942 old_inst: Inst, 943 ) { 944 let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| { 945 cursor.func.dfg.change_to_alias(old_val, new_val); 946 value_to_opt_value[old_val] = new_val; 947 }; 948 949 let (new_inst, new_val) = match simplification { 950 SkeletonInstSimplification::Remove => { 951 cursor.remove_inst_and_step_back(); 952 return; 953 } 954 SkeletonInstSimplification::RemoveWithVal { val } => { 955 cursor.remove_inst_and_step_back(); 956 let old_val = cursor.func.dfg.first_result(old_inst); 957 cursor.func.dfg.detach_inst_results(old_inst); 958 forward_val(cursor, old_val, val); 959 return; 960 } 961 SkeletonInstSimplification::Replace { inst } => (inst, None), 962 SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)), 963 }; 964 965 // Replace the old instruction with the new one. 966 cursor.replace_inst(new_inst); 967 debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator()); 968 969 // Redirect the old instruction's result values to our new instruction's 970 // result values. 971 let mut i = 0; 972 let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value { 973 if let Some(val) = new_val { 974 val 975 } else { 976 let val = dfg.inst_results(new_inst)[i]; 977 i += 1; 978 val 979 } 980 }; 981 for i in 0..cursor.func.dfg.inst_results(old_inst).len() { 982 let old_val = cursor.func.dfg.inst_results(old_inst)[i]; 983 let new_val = next_new_val(&cursor.func.dfg); 984 forward_val(cursor, old_val, new_val); 985 } 986 987 // Back up so that the next iteration of the outer egraph loop will 988 // process the new instruction. 989 cursor.goto_inst(new_inst); 990 cursor.prev_inst(); 991 } 992 993 /// Scoped elaboration: compute a final ordering of op computation 994 /// for each block and update the given Func body. After this 995 /// runs, the function body is back into the state where every 996 /// Inst with an used result is placed in the layout (possibly 997 /// duplicated, if our code-motion logic decides this is the best 998 /// option). 999 /// 1000 /// This works in concert with the domtree. We do a preorder 1001 /// traversal of the domtree, tracking a scoped map from Id to 1002 /// (new) Value. The map's scopes correspond to levels in the 1003 /// domtree. 1004 /// 1005 /// At each block, we iterate forward over the side-effecting 1006 /// eclasses, and recursively generate their arg eclasses, then 1007 /// emit the ops themselves. 1008 /// 1009 /// To use an eclass in a given block, we first look it up in the 1010 /// scoped map, and get the Value if already present. If not, we 1011 /// need to generate it. We emit the extracted enode for this 1012 /// eclass after recursively generating its args. Eclasses are 1013 /// thus computed "as late as possible", but then memoized into 1014 /// the Id-to-Value map and available to all dominated blocks and 1015 /// for the rest of this block. (This subsumes GVN.) 1016 fn elaborate(&mut self) { 1017 let mut elaborator = Elaborator::new( 1018 self.func, 1019 &self.domtree, 1020 self.loop_analysis, 1021 &self.remat_values, 1022 &mut self.stats, 1023 self.ctrl_plane, 1024 ); 1025 elaborator.elaborate(); 1026 1027 self.check_post_egraph(); 1028 } 1029 1030 #[cfg(debug_assertions)] 1031 fn check_post_egraph(&self) { 1032 // Verify that no union nodes are reachable from inst args, 1033 // and that all inst args' defining instructions are in the 1034 // layout. 1035 for block in self.func.layout.blocks() { 1036 for inst in self.func.layout.block_insts(block) { 1037 self.func 1038 .dfg 1039 .inst_values(inst) 1040 .for_each(|arg| match self.func.dfg.value_def(arg) { 1041 ValueDef::Result(i, _) => { 1042 debug_assert!(self.func.layout.inst_block(i).is_some()); 1043 } 1044 ValueDef::Union(..) => { 1045 panic!("egraph union node {arg} still reachable at {inst}!"); 1046 } 1047 _ => {} 1048 }) 1049 } 1050 } 1051 } 1052 1053 #[cfg(not(debug_assertions))] 1054 fn check_post_egraph(&self) {} 1055 } 1056 1057 /// Implementation of external-context equality and hashing on 1058 /// InstructionData. This allows us to deduplicate instructions given 1059 /// some context that lets us see its value lists, so we don't need to 1060 /// store arguments inline in the `InstructionData` (or alongside it in 1061 /// some newly-defined key type) in all cases. 1062 struct GVNContext<'a> { 1063 value_lists: &'a ValueListPool, 1064 } 1065 1066 impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> { 1067 fn ctx_eq( 1068 &self, 1069 (a_ty, a_inst): &(Type, InstructionData), 1070 (b_ty, b_inst): &(Type, InstructionData), 1071 ) -> bool { 1072 a_ty == b_ty && a_inst.eq(b_inst, self.value_lists) 1073 } 1074 } 1075 1076 impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> { 1077 fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) { 1078 core::hash::Hash::hash(&ty, state); 1079 inst.hash(state, self.value_lists); 1080 } 1081 } 1082 1083 /// Statistics collected during egraph-based processing. 1084 #[derive(Clone, Debug, Default)] 1085 pub(crate) struct Stats { 1086 pub(crate) pure_inst: u64, 1087 pub(crate) pure_inst_deduped: u64, 1088 pub(crate) pure_inst_subsume: u64, 1089 pub(crate) pure_inst_rewrite_to_self: u64, 1090 pub(crate) pure_inst_insert_orig: u64, 1091 pub(crate) pure_inst_insert_new: u64, 1092 pub(crate) skeleton_inst: u64, 1093 pub(crate) skeleton_inst_simplified: u64, 1094 pub(crate) skeleton_inst_gvn: u64, 1095 pub(crate) alias_analysis_removed: u64, 1096 pub(crate) new_inst: u64, 1097 pub(crate) union: u64, 1098 pub(crate) subsume: u64, 1099 pub(crate) remat: u64, 1100 pub(crate) rewrite_rule_invoked: u64, 1101 pub(crate) rewrite_rule_results: u64, 1102 pub(crate) rewrite_depth_limit: u64, 1103 pub(crate) elaborate_visit_node: u64, 1104 pub(crate) elaborate_memoize_hit: u64, 1105 pub(crate) elaborate_memoize_miss: u64, 1106 pub(crate) elaborate_remat: u64, 1107 pub(crate) elaborate_licm_hoist: u64, 1108 pub(crate) elaborate_func: u64, 1109 pub(crate) elaborate_func_pre_insts: u64, 1110 pub(crate) elaborate_func_post_insts: u64, 1111 pub(crate) eclass_size_limit: u64, 1112 } 1113