//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or increase the resource
// length.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth; treat them as having no defining instruction.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of its defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on its
  // operands. Use the trace information when possible. For virtual registers
  // defined by the new instructions, which are tracked in the
  // InstrIdxForVirtReg map, the depth is looked up in InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR ";
          InstrPtr->print(dbgs(), TII);
          dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
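
// Worked example for getDepth (the cycle counts are illustrative, not from
// any real schedule model): suppose NewRoot uses %v1, defined by an earlier
// new instruction at depth 2 with an operand latency of 3, and %v2, defined
// in the trace at depth 4 with an operand latency of 1. NewRoot's depth is
// then max(2 + 3, 4 + 1) = 5 cycles.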

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO (skipping the def itself).
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}
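
// Sketch of why the REASSOC_* patterns are depth-reducing (operand names are
// illustrative): a left-leaning chain
//   t0 = a + b; t1 = t0 + c; t2 = t1 + d   // three dependent adds, depth 3
// can be reassociated into
//   t0 = a + b; t1 = c + d; t2 = t0 + t1   // two dependence levels, depth 2
// which is exactly the improvement that MustReduceDepth asks
// improvesCriticalPathLen() below to verify.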

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path must improve the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
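
  // Worked example with made-up cycle counts: if NewRootDepth = 6 and
  // NewRootLatency = 3, the new sequence completes at cycle 9. If
  // RootDepth = 5, RootLatency = 3, and RootSlack = 2, the old sequence's
  // budget is 5 + 3 + 2 = 10, so the transform is accepted (9 <= 10) even
  // though the new depth is worse.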

  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = 0;

  for (auto I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions to their scheduling class
/// descriptors (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes (SC) rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}
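
// Resource-length intuition (the numbers are hypothetical): on a machine
// whose most constrained unit can issue one multiply per cycle, a trace
// containing four multiplies is at least four cycles long no matter how the
// dependencies line up. Replacing a MUL + ADD pair with one MADD adds no work
// to any unit, so the check above passes; a substitution that added work to
// the busiest unit would be rejected.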

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVectorImpl<MachineInstr *> &InsInstrs,
                                     SmallVectorImpl<MachineInstr *> &DelInstrs,
                                     MachineTraceMetrics *Traces) {
  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
  for (auto *InstrPtr : DelInstrs)
    InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
  ++NumInstCombined;
  Traces->invalidate(MBB);
  Traces->verifyAnalysis();
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaces MUL + ADD/SUB by MADD. While this
    // is usually beneficial for code size, it can unfortunately hurt
    // performance when the ADD is on the critical path but the MUL is not.
    // With the substitution the MUL becomes part of the critical path (in the
    // form of the MADD) and can lengthen it on architectures where the MADD
    // latency is longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
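
    // Example with illustrative latencies: if ADD has latency 1 and MADD has
    // latency 4, folding a MUL that is off the critical path into an MADD
    // that is on it can lengthen the path by 3 cycles; the checks invoked
    // below are what reject such a substitution.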

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      Traces->verifyAnalysis();
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate an alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // Calculating the trace metrics may be expensive,
        // so only do this when necessary.
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}