//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <new>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

struct CFStack {
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries = 0;
  unsigned CurrentSubEntries = 0;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}
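// Returns the number of stack sub-entries that pushing \p Item consumes;
// updateMaxStackSize() packs four sub-entries into one full stack entry.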
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII = nullptr;
  const R600RegisterInfo *TRI = nullptr;
  unsigned MaxFetchInst;
  const R600Subtarget *ST = nullptr;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }
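  // Map one of the pseudo control-flow opcodes above to the matching native
  // instruction for the current generation (R600/R700 vs. Evergreen, plus the
  // Cayman form of CF_END).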
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI, SrcMI;
    for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
         E = MI.operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
                                getHWInstrDesc(IsTex ? CF_TC : CF_VC))
                            .addImm(0) // ADDR
                            .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }
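  // Rewrite each ALU_LITERAL_X source of \p MI to one of the four literal
  // channels (X/Y/Z/W), reusing a channel whose immediate already matches and
  // otherwise allocating the next free one and recording it in \p Lits.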
  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          llvm::find_if(Lits, [&](MachineOperand *val) {
            return val->isImm() && (val->getImm() == Imm);
          });

      // Get corresponding Operand
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse existing literal reg
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate new literal reg
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
                          TII->get(AMDGPU::LITERALS))
                      .addImm(LiteralPair0)
                      .addImm(LiteralPair1);
    }
    return InsertPos;
  }

  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
                                            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
          MILit.addImm(Literals[i]->getImm());
        } else {
          MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                 Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }

  // Splice the instructions of a previously collected fetch clause to
  // \p InsertPos, set the clause head's ADDR operand to the current
  // control-flow word count, and advance the count by two per fetch
  // instruction.
  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       const DebugLoc &DL, ClauseFile &Clause,
                       unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
                     ClauseFile &Clause, unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // Either way the instruction now heads an ALU clause, so fall
          // through and handle it like a plain CF_ALU.
          LLVM_FALLTHROUGH;
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_WHILE_LOOP))
                                  .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_JUMP))
                                  .addImm(0)
                                  .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_ELSE))
                                  .addImm(0)
                                  .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                        getHWInstrDesc(CF_POP))
                                    .addImm(CfCount + 1)
                                    .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_BREAK))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_CONTINUE))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, DL, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, DL, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
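      // CF_ALU instructions recorded at an ENDIF are rewritten as
      // CF_ALU_POP_AFTER (with identical operands) so the pop is folded into
      // the clause instead of emitting a separate CF_POP.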
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
                TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->CFStackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  StringRef getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

FunctionPass *llvm::createR600ControlFlowFinalizer() {
  return new R600ControlFlowFinalizer();
}