//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

  bool optimizeVccBranch(MachineInstr &MI) const;

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();
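
  // Walk the blocks between From and To in layout order, counting only the
  // instructions that will actually be emitted. Return true as soon as the
  // threshold is reached, or earlier if an instruction is found that must not
  // execute with EXEC == 0 (making the skip a correctness requirement rather
  // than just a size heuristic).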
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09)  // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)   // vm
    .addImm(0)   // compr
    .addImm(0);  // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
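    // (For instance, ISD::SETOLT for "x < imm" is emitted below as
    // V_CMPX_GT_F32 with the immediate as the first source operand.)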
from "x < imm" to "imm > x" 195 switch (MI.getOperand(2).getImm()) { 196 case ISD::SETOEQ: 197 case ISD::SETEQ: 198 Opcode = AMDGPU::V_CMPX_EQ_F32_e64; 199 break; 200 case ISD::SETOGT: 201 case ISD::SETGT: 202 Opcode = AMDGPU::V_CMPX_LT_F32_e64; 203 break; 204 case ISD::SETOGE: 205 case ISD::SETGE: 206 Opcode = AMDGPU::V_CMPX_LE_F32_e64; 207 break; 208 case ISD::SETOLT: 209 case ISD::SETLT: 210 Opcode = AMDGPU::V_CMPX_GT_F32_e64; 211 break; 212 case ISD::SETOLE: 213 case ISD::SETLE: 214 Opcode = AMDGPU::V_CMPX_GE_F32_e64; 215 break; 216 case ISD::SETONE: 217 case ISD::SETNE: 218 Opcode = AMDGPU::V_CMPX_LG_F32_e64; 219 break; 220 case ISD::SETO: 221 Opcode = AMDGPU::V_CMPX_O_F32_e64; 222 break; 223 case ISD::SETUO: 224 Opcode = AMDGPU::V_CMPX_U_F32_e64; 225 break; 226 case ISD::SETUEQ: 227 Opcode = AMDGPU::V_CMPX_NLG_F32_e64; 228 break; 229 case ISD::SETUGT: 230 Opcode = AMDGPU::V_CMPX_NGE_F32_e64; 231 break; 232 case ISD::SETUGE: 233 Opcode = AMDGPU::V_CMPX_NGT_F32_e64; 234 break; 235 case ISD::SETULT: 236 Opcode = AMDGPU::V_CMPX_NLE_F32_e64; 237 break; 238 case ISD::SETULE: 239 Opcode = AMDGPU::V_CMPX_NLT_F32_e64; 240 break; 241 case ISD::SETUNE: 242 Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; 243 break; 244 default: 245 llvm_unreachable("invalid ISD:SET cond code"); 246 } 247 248 assert(MI.getOperand(0).isReg()); 249 250 if (TRI->isVGPR(MBB.getParent()->getRegInfo(), 251 MI.getOperand(0).getReg())) { 252 Opcode = AMDGPU::getVOPe32(Opcode); 253 BuildMI(MBB, &MI, DL, TII->get(Opcode)) 254 .add(MI.getOperand(1)) 255 .add(MI.getOperand(0)); 256 } else { 257 BuildMI(MBB, &MI, DL, TII->get(Opcode)) 258 .addReg(AMDGPU::VCC, RegState::Define) 259 .addImm(0) // src0 modifiers 260 .add(MI.getOperand(1)) 261 .addImm(0) // src1 modifiers 262 .add(MI.getOperand(0)) 263 .addImm(0); // omod 264 } 265 break; 266 } 267 case AMDGPU::SI_KILL_I1_TERMINATOR: { 268 const MachineOperand &Op = MI.getOperand(0); 269 int64_t KillVal = MI.getOperand(1).getImm(); 270 assert(KillVal == 0 || KillVal == -1); 271 272 // Kill all threads if Op0 is an immediate and equal to the Kill value. 273 if (Op.isImm()) { 274 int64_t Imm = Op.getImm(); 275 assert(Imm == 0 || Imm == -1); 276 277 if (Imm == KillVal) 278 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 279 .addImm(0); 280 break; 281 } 282 283 unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; 284 BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC) 285 .addReg(AMDGPU::EXEC) 286 .add(Op); 287 break; 288 } 289 default: 290 llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); 291 } 292 } 293 294 MachineBasicBlock *SIInsertSkips::insertSkipBlock( 295 MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { 296 MachineFunction *MF = MBB.getParent(); 297 298 MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); 299 MachineFunction::iterator MBBI(MBB); 300 ++MBBI; 301 302 MF->insert(MBBI, SkipBB); 303 MBB.addSuccessor(SkipBB); 304 305 return SkipBB; 306 } 307 308 // Returns true if a branch over the block was inserted. 
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const unsigned CondReg = AMDGPU::VCC;
  const unsigned ExecReg = AMDGPU::EXEC;
  const unsigned And = AMDGPU::S_AND_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A ; A != E ; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

  unsigned SReg = AMDGPU::NoRegister;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for ( ; M != E ; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E ||
        !M->isMoveImmediate() ||
        !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // If SReg is only used in the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
                               : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
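  // Each SI_MASK_BRANCH pushes its target (join) block; reaching that block
  // pops it again, so an empty stack means we are not inside any divergent
  // region.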
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove the given unconditional branch when a skip block has been
          // inserted after the current one, so that the two instructions
          // performing the kill are skipped when the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
          // at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        MadeChange |= optimizeVccBranch(MI);
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}