//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure that the
// combined instructions do not lengthen the critical path or the resource
// depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machineinst combined");

namespace {
/// Machine-level instruction combiner.
///
/// Asks the target (via TargetInstrInfo) for combining patterns rooted at
/// each instruction, generates the alternative code sequence, and commits it
/// only when machine trace metrics show the substitution does not lengthen
/// the critical path or increase resource pressure (or when optimizing for
/// size and the new sequence is shorter).
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineTraceMetrics *Traces;
  // Lazily-created min-instruction-count trace ensemble; reset per function.
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

/// Declare the analyses this pass requires and preserves. The CFG is never
/// modified, so dominators/loop info survive; MachineTraceMetrics is both
/// required (for the cost model) and preserved (invalidated per-block
/// explicitly in combineInstructions).
void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

/// Return the unique defining instruction of the virtual register in \p MO,
/// or nullptr when \p MO is not a virtual-register operand, has no unique
/// definition, or is defined by a PHI (PHIs carry no depth information).
MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHI's have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of its defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map the depth is looked up in
  // InstrDepth. This relies on InsInstrs being in dependence order: a new
  // instruction's operands are defined either earlier in InsInstrs or in the
  // existing trace.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register, not in the trace: take the depth
        // computed for its defining instruction earlier in this loop.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        // Operand is defined by an existing instruction: use its trace depth.
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      // Depth of this instruction is the max over all its input chains.
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the max latency over all defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO by skipping past the first
    // entry of the register's use/def list (the definition itself).
    // NOTE(review): RI is incremented without checking against
    // MRI->reg_end(); if the register has no recorded use, RI->getParent()
    // dereferences an invalid iterator before the UseMO null-check below —
    // verify that every def reaching here is guaranteed a use.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      // The user is in the trace: use the precise def->use operand latency.
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      // Otherwise fall back to the instruction's overall latency.
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

/// Map a combiner pattern to the objective its cost model must satisfy.
/// Reassociation patterns only shuffle the dependency chain, so they must
/// strictly reduce depth to be worthwhile; everything else uses the default
/// critical-path test.
static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  // NOTE(review): streaming Root (a MachineInstr*) prints the pointer value,
  // not the instruction; confirm whether *Root / Root->dump() was intended.
  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  // Accept on ties: equal cycle count with (presumably) fewer instructions.
  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling-class
/// descriptors (SC), as required by Trace::getResourceLength().
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  // Without a scheduling model there is no resource information; be
  // permissive rather than blocking the transform.
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length. getResourceLength() takes a list of
  // extra blocks to consider; wrap the single block in a SmallVector.
  SmallVector <const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path: either we are
/// optimizing for size and the sequence shrank, or there is no machine
/// model to evaluate it against.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for codesize always combine when the new
/// sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  while (BlockIter != MBB->end()) {
    // Advance the iterator before combining: when a pattern fires, MI is in
    // DelInstrs and gets erased below, so BlockIter must already point past
    // it to remain valid.
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) is met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      // Create the trace ensemble lazily on first use; it persists until the
      // pass-level state is reset in runOnMachineFunction.
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      // Ask the target to materialize the alternative sequence for pattern P.
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      // Substitute when we optimize for codesize and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        // Splice the new sequence in before MI, then erase the old one
        // (DelInstrs includes MI itself).
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        // Trace data for this block is stale after the edit.
        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}

/// Entry point: cache target hooks and analyses for this function, then run
/// the combiner over every basic block. Returns true if any instruction
/// sequence was replaced. Bails out early (no change) when the target does
/// not opt in via TargetInstrInfo::useMachineCombiner().
bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  Traces = &getAnalysis<MachineTraceMetrics>();
  // Reset the lazily-built ensemble; it belongs to the previous function.
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs() << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}