1 //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief This pass inserts branches on the 0 exec mask over divergent branches 12 /// branches when it's expected that jumping over the untaken control flow will 13 /// be cheaper than having every workitem no-op through it. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "AMDGPU.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIInstrInfo.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/StringRef.h" 23 #include "llvm/CodeGen/MachineBasicBlock.h" 24 #include "llvm/CodeGen/MachineFunction.h" 25 #include "llvm/CodeGen/MachineFunctionPass.h" 26 #include "llvm/CodeGen/MachineInstr.h" 27 #include "llvm/CodeGen/MachineInstrBuilder.h" 28 #include "llvm/CodeGen/MachineOperand.h" 29 #include "llvm/IR/CallingConv.h" 30 #include "llvm/IR/DebugLoc.h" 31 #include "llvm/MC/MCAsmInfo.h" 32 #include "llvm/Pass.h" 33 #include "llvm/Support/CommandLine.h" 34 #include "llvm/Target/TargetMachine.h" 35 #include <cassert> 36 #include <cstdint> 37 #include <iterator> 38 39 using namespace llvm; 40 41 #define DEBUG_TYPE "si-insert-skips" 42 43 static cl::opt<unsigned> SkipThresholdFlag( 44 "amdgpu-skip-threshold", 45 cl::desc("Number of instructions before jumping over divergent control flow"), 46 cl::init(12), cl::Hidden); 47 48 namespace { 49 50 class SIInsertSkips : public MachineFunctionPass { 51 private: 52 const SIRegisterInfo *TRI = nullptr; 53 const SIInstrInfo *TII = nullptr; 54 unsigned SkipThreshold = 0; 55 56 bool shouldSkip(const MachineBasicBlock &From, 57 const MachineBasicBlock &To) const; 58 59 bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); 60 61 void kill(MachineInstr &MI); 62 63 MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, 64 MachineBasicBlock::iterator I) const; 65 66 bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); 67 68 public: 69 static char ID; 70 71 SIInsertSkips() : MachineFunctionPass(ID) {} 72 73 bool runOnMachineFunction(MachineFunction &MF) override; 74 75 StringRef getPassName() const override { 76 return "SI insert s_cbranch_execz instructions"; 77 } 78 79 void getAnalysisUsage(AnalysisUsage &AU) const override { 80 MachineFunctionPass::getAnalysisUsage(AU); 81 } 82 }; 83 84 } // end anonymous namespace 85 86 char SIInsertSkips::ID = 0; 87 88 INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, 89 "SI insert s_cbranch_execz instructions", false, false) 90 91 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; 92 93 static bool opcodeEmitsNoInsts(unsigned Opc) { 94 switch (Opc) { 95 case TargetOpcode::IMPLICIT_DEF: 96 case TargetOpcode::KILL: 97 case TargetOpcode::BUNDLE: 98 case TargetOpcode::CFI_INSTRUCTION: 99 case TargetOpcode::EH_LABEL: 100 case TargetOpcode::GC_LABEL: 101 case TargetOpcode::DBG_VALUE: 102 return true; 103 default: 104 return false; 105 } 106 } 107 108 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, 109 const MachineBasicBlock &To) const { 110 if (From.succ_empty()) 111 return false; 112 113 unsigned NumInstr = 0; 114 const MachineFunction *MF = From.getParent(); 115 116 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); 117 MBBI != End && MBBI != ToI; ++MBBI) { 118 const MachineBasicBlock &MBB = *MBBI; 119 120 for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); 121 NumInstr < SkipThreshold && I != E; ++I) { 122 if (opcodeEmitsNoInsts(I->getOpcode())) 123 continue; 124 125 // FIXME: Since this is required for correctness, this should be inserted 126 // during SILowerControlFlow. 127 128 // When a uniform loop is inside non-uniform control flow, the branch 129 // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken 130 // when EXEC = 0. We should skip the loop lest it becomes infinite. 131 if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || 132 I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) 133 return true; 134 135 if (I->isInlineAsm()) { 136 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 137 const char *AsmStr = I->getOperand(0).getSymbolName(); 138 139 // inlineasm length estimate is number of bytes assuming the longest 140 // instruction. 141 uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); 142 NumInstr += MaxAsmSize / MAI->getMaxInstLength(); 143 } else { 144 ++NumInstr; 145 } 146 147 if (NumInstr >= SkipThreshold) 148 return true; 149 } 150 } 151 152 return false; 153 } 154 155 bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { 156 MachineBasicBlock &MBB = *MI.getParent(); 157 MachineFunction *MF = MBB.getParent(); 158 159 if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || 160 !shouldSkip(MBB, MBB.getParent()->back())) 161 return false; 162 163 MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); 164 165 const DebugLoc &DL = MI.getDebugLoc(); 166 167 // If the exec mask is non-zero, skip the next two instructions 168 BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 169 .addMBB(&NextBB); 170 171 MachineBasicBlock::iterator Insert = SkipBB->begin(); 172 173 // Exec mask is zero: Export to NULL target... 174 BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) 175 .addImm(0x09) // V_008DFC_SQ_EXP_NULL 176 .addReg(AMDGPU::VGPR0, RegState::Undef) 177 .addReg(AMDGPU::VGPR0, RegState::Undef) 178 .addReg(AMDGPU::VGPR0, RegState::Undef) 179 .addReg(AMDGPU::VGPR0, RegState::Undef) 180 .addImm(1) // vm 181 .addImm(0) // compr 182 .addImm(0); // en 183 184 // ... and terminate wavefront. 185 BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); 186 187 return true; 188 } 189 190 void SIInsertSkips::kill(MachineInstr &MI) { 191 MachineBasicBlock &MBB = *MI.getParent(); 192 DebugLoc DL = MI.getDebugLoc(); 193 const MachineOperand &Op = MI.getOperand(0); 194 195 #ifndef NDEBUG 196 CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); 197 // Kill is only allowed in pixel / geometry shaders. 198 assert(CallConv == CallingConv::AMDGPU_PS || 199 CallConv == CallingConv::AMDGPU_GS); 200 #endif 201 // Clear this thread from the exec mask if the operand is negative. 202 if (Op.isImm()) { 203 // Constant operand: Set exec mask to 0 or do nothing 204 if (Op.getImm() & 0x80000000) { 205 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 206 .addImm(0); 207 } 208 } else { 209 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) 210 .addImm(0) 211 .add(Op); 212 } 213 } 214 215 MachineBasicBlock *SIInsertSkips::insertSkipBlock( 216 MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { 217 MachineFunction *MF = MBB.getParent(); 218 219 MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); 220 MachineFunction::iterator MBBI(MBB); 221 ++MBBI; 222 223 MF->insert(MBBI, SkipBB); 224 MBB.addSuccessor(SkipBB); 225 226 return SkipBB; 227 } 228 229 // Returns true if a branch over the block was inserted. 230 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, 231 MachineBasicBlock &SrcMBB) { 232 MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); 233 234 if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) 235 return false; 236 237 const DebugLoc &DL = MI.getDebugLoc(); 238 MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); 239 240 BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) 241 .addMBB(DestBB); 242 243 return true; 244 } 245 246 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { 247 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 248 TII = ST.getInstrInfo(); 249 TRI = &TII->getRegisterInfo(); 250 SkipThreshold = SkipThresholdFlag; 251 252 bool HaveKill = false; 253 bool MadeChange = false; 254 255 // Track depth of exec mask, divergent branches. 256 SmallVector<MachineBasicBlock *, 16> ExecBranchStack; 257 258 MachineFunction::iterator NextBB; 259 260 MachineBasicBlock *EmptyMBBAtEnd = nullptr; 261 262 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 263 BI != BE; BI = NextBB) { 264 NextBB = std::next(BI); 265 MachineBasicBlock &MBB = *BI; 266 bool HaveSkipBlock = false; 267 268 if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { 269 // Reached convergence point for last divergent branch. 270 ExecBranchStack.pop_back(); 271 } 272 273 if (HaveKill && ExecBranchStack.empty()) { 274 HaveKill = false; 275 276 // TODO: Insert skip if exec is 0? 277 } 278 279 MachineBasicBlock::iterator I, Next; 280 for (I = MBB.begin(); I != MBB.end(); I = Next) { 281 Next = std::next(I); 282 283 MachineInstr &MI = *I; 284 285 switch (MI.getOpcode()) { 286 case AMDGPU::SI_MASK_BRANCH: 287 ExecBranchStack.push_back(MI.getOperand(0).getMBB()); 288 MadeChange |= skipMaskBranch(MI, MBB); 289 break; 290 291 case AMDGPU::S_BRANCH: 292 // Optimize out branches to the next block. 293 // FIXME: Shouldn't this be handled by BranchFolding? 294 if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { 295 MI.eraseFromParent(); 296 } else if (HaveSkipBlock) { 297 // Remove the given unconditional branch when a skip block has been 298 // inserted after the current one and let skip the two instructions 299 // performing the kill if the exec mask is non-zero. 300 MI.eraseFromParent(); 301 } 302 break; 303 304 case AMDGPU::SI_KILL_TERMINATOR: 305 MadeChange = true; 306 kill(MI); 307 308 if (ExecBranchStack.empty()) { 309 if (skipIfDead(MI, *NextBB)) { 310 HaveSkipBlock = true; 311 NextBB = std::next(BI); 312 BE = MF.end(); 313 } 314 } else { 315 HaveKill = true; 316 } 317 318 MI.eraseFromParent(); 319 break; 320 321 case AMDGPU::SI_RETURN_TO_EPILOG: 322 // FIXME: Should move somewhere else 323 assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); 324 325 // Graphics shaders returning non-void shouldn't contain S_ENDPGM, 326 // because external bytecode will be appended at the end. 327 if (BI != --MF.end() || I != MBB.getFirstTerminator()) { 328 // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at 329 // the end and jump there. 330 if (!EmptyMBBAtEnd) { 331 EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); 332 MF.insert(MF.end(), EmptyMBBAtEnd); 333 } 334 335 MBB.addSuccessor(EmptyMBBAtEnd); 336 BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) 337 .addMBB(EmptyMBBAtEnd); 338 I->eraseFromParent(); 339 } 340 break; 341 342 default: 343 break; 344 } 345 } 346 } 347 348 return MadeChange; 349 } 350