//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure that the
// combined instructions do not lengthen the critical path or the resource
// depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;
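  // Note: TSchedModel wraps the subtarget's MCSchedModel (or, failing that,
  // its itineraries) and is the source of the operand and instruction
  // latencies queried throughout this pass.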

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes the depth of instructions in the vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of its defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence, compute the depth based on its
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
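  // Worked example (hypothetical values): if a use operand is defined by the
  // second new instruction, with InstrDepth[1] = 3 and an operand latency of
  // 2 cycles, that operand contributes 3 + 2 = 5 cycles to IDepth below.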
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes the latency of an instruction as the maximum latency over its
/// defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO; step past the initial
    // reg_begin() entry, which is expected to be MO's own definition.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
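  // For example (hypothetical values): if NewRoot defines two virtual
  // registers whose computed operand latencies are 3 and 5 cycles, the loop
  // above yields NewRootLatency = 5.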
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path must improve the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
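  // For example (hypothetical numbers): with NewRootDepth = 6 and
  // NewRootLatency = 3, the new sequence completes at cycle 9. With
  // RootDepth = 4, RootLatency = 4, and RootSlack = 2, the old sequence may
  // finish as late as cycle 10 without stretching the critical path, so
  // 9 <= 10 lets the substitution proceed.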

  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = 0;

  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes.
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
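  // Example: under -Os, shrinking a two-instruction sequence to a single
  // instruction (NewSize = 1 < OldSize = 2) is accepted above without any
  // cycle checks; otherwise the decision is left to improvesCriticalPathLen()
  // and preservesResourceLen() in the caller.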
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaces MUL + ADD/SUB by MADD. While this
    // is usually beneficial for code size it unfortunately can hurt
    // performance when the ADD is on the critical path, but the MUL is not.
    // With the substitution the MUL becomes part of the critical path (in
    // form of the MADD) and can lengthen it on architectures where the MADD
    // latency is longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found a pattern, but did not generate an alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   DelInstrs, InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        // Hook the new sequence in before Root, then erase the old one.
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
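        // Note: invalidating the trace after each successful combine keeps
        // later queries exact at some compile-time cost; the trace is
        // recomputed lazily on the next getTrace() call.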
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}