//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure that the
// combined instructions do not lengthen the critical path or the resource
// depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");
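// Overview (summary comment): for each instruction, the pass asks the target
// for combinable patterns rooted at that instruction, materializes the
// alternative sequence, and uses MachineTraceMetrics to accept only rewrites
// that do not hurt the critical path or the resource length. See
// combineInstructions() below for the details.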
namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo.
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes the depth of each instruction in the vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg maps a virtual register to the index of its
/// defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence, compute the depth based on its
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
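  // Worked example (hypothetical cycle counts, not taken from any particular
  // machine model): if a new instruction uses %v1, defined earlier in
  // InsInstrs with depth 3 and a 2-cycle operand latency, and %v2, defined in
  // the trace at depth 4 with a 1-cycle operand latency, then its depth is
  // max(3 + 2, 4 + 1) = 5.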
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR ";
          InstrPtr->print(dbgs(), TII);
          dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the maximum latency of its defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO; the increment skips the
    // definition's own entry in the use-def chain. Guard against a dead
    // definition with no users.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = (RI == MRI->reg_end()) ? nullptr : RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
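// Worked example for getLatency() (hypothetical latencies, for illustration
// only): if NewRoot defines %v3 and the first user of %v3 is a dependence in
// the trace with a 3-cycle operand latency, NewRootLatency is 3; if the user
// is not in the trace, the full instruction latency from the scheduling model
// (say, 4 cycles) is used instead.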
/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the
/// new sequence to replace the old sequence is that it cannot lengthen the
/// critical path. The definition of "improve" may be restricted by specifying
/// that the new path must improve the data dependency chain
/// (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
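  // Worked example (hypothetical cycle counts): with NewRootDepth = 4 and
  // NewRootLatency = 3, NewCycleCount is 7. If the old sequence has
  // RootDepth = 3, RootLatency = 2, and RootSlack = 2, OldCycleCount is also
  // 7, and the substitution is allowed because 7 <= 7.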
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = 0;

  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase the resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes (SC) rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}
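// Note for readers: a target opts into this pass by returning true from
// TargetInstrInfo::useMachineCombiner() and supplies the patterns and the
// replacement sequences through TargetInstrInfo::getMachineCombinerPatterns()
// and TargetInstrInfo::genAlternativeCodeSequence(), both of which are
// invoked in combineInstructions() below.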
/// Substitute a slow code sequence with a faster one by evaluating
/// instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |     /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaces MUL + ADD/SUB with MADD. While this
    // is usually beneficial for code size, it can unfortunately hurt
    // performance when the ADD is on the critical path but the MUL is not.
    // With the substitution the MUL becomes part of the critical path (in the
    // form of the MADD) and can lengthen it on architectures where the MADD
    // latency is longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // the critical path, don't use more resources) are met, the new sequence
    // gets hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction, though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found a pattern, but did not generate an alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   DelInstrs, InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}
bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}