1 //===-- SIInsertSkips.cpp - Use predicates for control flow ----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief This pass inserts branches on the 0 exec mask over divergent branches 12 /// branches when it's expected that jumping over the untaken control flow will 13 /// be cheaper than having every workitem no-op through it. 14 // 15 16 #include "AMDGPU.h" 17 #include "AMDGPUSubtarget.h" 18 #include "SIInstrInfo.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineFunction.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/CodeGen/MachineInstrBuilder.h" 24 #include "llvm/MC/MCAsmInfo.h" 25 26 using namespace llvm; 27 28 #define DEBUG_TYPE "si-insert-skips" 29 30 namespace { 31 32 static cl::opt<unsigned> SkipThresholdFlag( 33 "amdgpu-skip-threshold", 34 cl::desc("Number of instructions before jumping over divergent control flow"), 35 cl::init(12), cl::Hidden); 36 37 class SIInsertSkips : public MachineFunctionPass { 38 private: 39 const SIRegisterInfo *TRI; 40 const SIInstrInfo *TII; 41 unsigned SkipThreshold; 42 43 bool shouldSkip(const MachineBasicBlock &From, 44 const MachineBasicBlock &To) const; 45 46 bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); 47 48 void kill(MachineInstr &MI); 49 50 MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, 51 MachineBasicBlock::iterator I) const; 52 53 bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); 54 55 public: 56 static char ID; 57 58 SIInsertSkips() : 59 MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { } 60 61 bool runOnMachineFunction(MachineFunction &MF) override; 62 63 StringRef getPassName() const override { 64 return "SI insert s_cbranch_execz instructions"; 65 } 66 67 void getAnalysisUsage(AnalysisUsage &AU) const override { 68 MachineFunctionPass::getAnalysisUsage(AU); 69 } 70 }; 71 72 } // End anonymous namespace 73 74 char SIInsertSkips::ID = 0; 75 76 INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, 77 "SI insert s_cbranch_execz instructions", false, false) 78 79 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; 80 81 static bool opcodeEmitsNoInsts(unsigned Opc) { 82 switch (Opc) { 83 case TargetOpcode::IMPLICIT_DEF: 84 case TargetOpcode::KILL: 85 case TargetOpcode::BUNDLE: 86 case TargetOpcode::CFI_INSTRUCTION: 87 case TargetOpcode::EH_LABEL: 88 case TargetOpcode::GC_LABEL: 89 case TargetOpcode::DBG_VALUE: 90 return true; 91 default: 92 return false; 93 } 94 } 95 96 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, 97 const MachineBasicBlock &To) const { 98 if (From.succ_empty()) 99 return false; 100 101 unsigned NumInstr = 0; 102 const MachineFunction *MF = From.getParent(); 103 104 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); 105 MBBI != End && MBBI != ToI; ++MBBI) { 106 const MachineBasicBlock &MBB = *MBBI; 107 108 for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); 109 NumInstr < SkipThreshold && I != E; ++I) { 110 if (opcodeEmitsNoInsts(I->getOpcode())) 111 continue; 112 113 // FIXME: Since this is required for correctness, this should be inserted 114 // during SILowerControlFlow. 115 116 // When a uniform loop is inside non-uniform control flow, the branch 117 // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken 118 // when EXEC = 0. We should skip the loop lest it becomes infinite. 119 if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || 120 I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) 121 return true; 122 123 if (I->isInlineAsm()) { 124 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 125 const char *AsmStr = I->getOperand(0).getSymbolName(); 126 127 // inlineasm length estimate is number of bytes assuming the longest 128 // instruction. 129 uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); 130 NumInstr += MaxAsmSize / MAI->getMaxInstLength(); 131 } else { 132 ++NumInstr; 133 } 134 135 if (NumInstr >= SkipThreshold) 136 return true; 137 } 138 } 139 140 return false; 141 } 142 143 bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { 144 MachineBasicBlock &MBB = *MI.getParent(); 145 MachineFunction *MF = MBB.getParent(); 146 147 if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || 148 !shouldSkip(MBB, MBB.getParent()->back())) 149 return false; 150 151 MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); 152 153 const DebugLoc &DL = MI.getDebugLoc(); 154 155 // If the exec mask is non-zero, skip the next two instructions 156 BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 157 .addMBB(&NextBB); 158 159 MachineBasicBlock::iterator Insert = SkipBB->begin(); 160 161 // Exec mask is zero: Export to NULL target... 162 BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) 163 .addImm(0x09) // V_008DFC_SQ_EXP_NULL 164 .addReg(AMDGPU::VGPR0, RegState::Undef) 165 .addReg(AMDGPU::VGPR0, RegState::Undef) 166 .addReg(AMDGPU::VGPR0, RegState::Undef) 167 .addReg(AMDGPU::VGPR0, RegState::Undef) 168 .addImm(1) // vm 169 .addImm(0) // compr 170 .addImm(0); // en 171 172 // ... and terminate wavefront. 173 BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); 174 175 return true; 176 } 177 178 void SIInsertSkips::kill(MachineInstr &MI) { 179 MachineBasicBlock &MBB = *MI.getParent(); 180 DebugLoc DL = MI.getDebugLoc(); 181 const MachineOperand &Op = MI.getOperand(0); 182 183 #ifndef NDEBUG 184 CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); 185 // Kill is only allowed in pixel / geometry shaders. 186 assert(CallConv == CallingConv::AMDGPU_PS || 187 CallConv == CallingConv::AMDGPU_GS); 188 #endif 189 // Clear this thread from the exec mask if the operand is negative. 190 if (Op.isImm()) { 191 // Constant operand: Set exec mask to 0 or do nothing 192 if (Op.getImm() & 0x80000000) { 193 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 194 .addImm(0); 195 } 196 } else { 197 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) 198 .addImm(0) 199 .addOperand(Op); 200 } 201 } 202 203 MachineBasicBlock *SIInsertSkips::insertSkipBlock( 204 MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { 205 MachineFunction *MF = MBB.getParent(); 206 207 MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); 208 MachineFunction::iterator MBBI(MBB); 209 ++MBBI; 210 211 MF->insert(MBBI, SkipBB); 212 MBB.addSuccessor(SkipBB); 213 214 return SkipBB; 215 } 216 217 // Returns true if a branch over the block was inserted. 218 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, 219 MachineBasicBlock &SrcMBB) { 220 MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); 221 222 if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) 223 return false; 224 225 const DebugLoc &DL = MI.getDebugLoc(); 226 MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); 227 228 BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) 229 .addMBB(DestBB); 230 231 return true; 232 } 233 234 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { 235 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 236 TII = ST.getInstrInfo(); 237 TRI = &TII->getRegisterInfo(); 238 SkipThreshold = SkipThresholdFlag; 239 240 bool HaveKill = false; 241 bool MadeChange = false; 242 243 // Track depth of exec mask, divergent branches. 244 SmallVector<MachineBasicBlock *, 16> ExecBranchStack; 245 246 MachineFunction::iterator NextBB; 247 248 MachineBasicBlock *EmptyMBBAtEnd = nullptr; 249 250 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 251 BI != BE; BI = NextBB) { 252 NextBB = std::next(BI); 253 MachineBasicBlock &MBB = *BI; 254 255 if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { 256 // Reached convergence point for last divergent branch. 257 ExecBranchStack.pop_back(); 258 } 259 260 if (HaveKill && ExecBranchStack.empty()) { 261 HaveKill = false; 262 263 // TODO: Insert skip if exec is 0? 264 } 265 266 MachineBasicBlock::iterator I, Next; 267 for (I = MBB.begin(); I != MBB.end(); I = Next) { 268 Next = std::next(I); 269 270 MachineInstr &MI = *I; 271 272 switch (MI.getOpcode()) { 273 case AMDGPU::SI_MASK_BRANCH: { 274 ExecBranchStack.push_back(MI.getOperand(0).getMBB()); 275 MadeChange |= skipMaskBranch(MI, MBB); 276 break; 277 } 278 case AMDGPU::S_BRANCH: { 279 // Optimize out branches to the next block. 280 // FIXME: Shouldn't this be handled by BranchFolding? 281 if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) 282 MI.eraseFromParent(); 283 break; 284 } 285 case AMDGPU::SI_KILL_TERMINATOR: { 286 MadeChange = true; 287 kill(MI); 288 289 if (ExecBranchStack.empty()) { 290 if (skipIfDead(MI, *NextBB)) { 291 NextBB = std::next(BI); 292 BE = MF.end(); 293 Next = MBB.end(); 294 } 295 } else { 296 HaveKill = true; 297 } 298 299 MI.eraseFromParent(); 300 break; 301 } 302 case AMDGPU::SI_RETURN: { 303 // FIXME: Should move somewhere else 304 assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); 305 306 // Graphics shaders returning non-void shouldn't contain S_ENDPGM, 307 // because external bytecode will be appended at the end. 308 if (BI != --MF.end() || I != MBB.getFirstTerminator()) { 309 // SI_RETURN is not the last instruction. Add an empty block at 310 // the end and jump there. 311 if (!EmptyMBBAtEnd) { 312 EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); 313 MF.insert(MF.end(), EmptyMBBAtEnd); 314 } 315 316 MBB.addSuccessor(EmptyMBBAtEnd); 317 BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) 318 .addMBB(EmptyMBBAtEnd); 319 I->eraseFromParent(); 320 } 321 } 322 default: 323 break; 324 } 325 } 326 } 327 328 return MadeChange; 329 } 330