//===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

struct CFStack {
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries = 0;
  unsigned CurrentSubEntries = 0;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

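// Returns how many stack sub-entries an item of the given kind occupies.
// Non-WQM pushes need extra padding sub-entries on some hardware
// generations; see the per-case comments below for the exact amounts.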
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

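// The finalizer pass itself.  It walks every basic block, groups fetch and
// ALU instructions into clauses, lowers the structured control-flow pseudo
// instructions to their native counterparts, and records the maximum
// control-flow stack depth in R600MachineFunctionInfo.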
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  using ClauseFile = std::pair<MachineInstr *, std::vector<MachineInstr *>>;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  const R600InstrInfo *TII = nullptr;
  const R600RegisterInfo *TRI = nullptr;
  unsigned MaxFetchInst;
  const R600Subtarget *ST = nullptr;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI, SrcMI;
    for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
                                          E = MI.operands_end();
         I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

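  // Greedily packs consecutive texture (or vertex) fetch instructions into a
  // single fetch clause headed by a CF_TC (or CF_VC) instruction, stopping at
  // MaxFetchInst instructions or at the first incompatible instruction.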
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
                                getHWInstrDesc(IsTex ? CF_TC : CF_VC))
                            .addImm(0)                 // ADDR
                            .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }

  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          llvm::find_if(Lits, [&](MachineOperand *val) {
            return val->isImm() && (val->getImm() == Imm);
          });

      // Get corresponding Operand
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse existing literal reg
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate new literal reg
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
                          TII->get(AMDGPU::LITERALS))
                      .addImm(LiteralPair0)
                      .addImm(LiteralPair1);
    }
    return InsertPos;
  }

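  // Collects the ALU instructions that follow the clause head into a single
  // ALU clause: bundles are flattened, literal operands are assigned to the
  // ALU_LITERAL_{X,Y,Z,W} slots and emitted as LITERALS instructions, and the
  // resulting instruction count is patched into the clause head.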
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
                                            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
          MILit.addImm(Literals[i]->getImm());
        } else {
          MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                 Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }

  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       const DebugLoc &DL, ClauseFile &Clause,
                       unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
                     ClauseFile &Clause, unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }

public:
  static char ID;

  R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}

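  // Main entry point.  For every basic block: emit a leading CALL_FS for
  // vertex shaders, group fetch and ALU instructions into clauses, lower the
  // structured control-flow pseudos (WHILELOOP, ENDLOOP, IF_PREDICATE_SET,
  // ELSE, ENDIF, BREAK, CONTINUE, RETURN) to native CF instructions while
  // tracking the CF stack depth, and finally record the maximum stack size.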
  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction().getCallingConv());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          LLVM_FALLTHROUGH;
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_WHILE_LOOP))
                                  .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_JUMP))
                                  .addImm(0)
                                  .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_ELSE))
                                  .addImm(0)
                                  .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                        getHWInstrDesc(CF_POP))
                                    .addImm(CfCount + 1)
                                    .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_BREAK))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_CONTINUE))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, DL, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, DL, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
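      // CF_ALU instructions that were recorded as the last ALU clause before
      // an ENDIF are rewritten to CF_ALU_POP_AFTER, so the corresponding
      // stack entry is popped once the clause has executed instead of
      // emitting a separate POP.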
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
                TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->CFStackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  StringRef getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(R600ControlFlowFinalizer, DEBUG_TYPE,
                      "R600 Control Flow Finalizer", false, false)
INITIALIZE_PASS_END(R600ControlFlowFinalizer, DEBUG_TYPE,
                    "R600 Control Flow Finalizer", false, false)

char R600ControlFlowFinalizer::ID = 0;

char &llvm::R600ControlFlowFinalizerID = R600ControlFlowFinalizer::ID;

FunctionPass *llvm::createR600ControlFlowFinalizer() {
  return new R600ControlFlowFinalizer();
}