//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
48 /// 49 /// (2) when entire regions (e.g. if-else blocks or entire loops) only 50 /// consist of exact and don't-care instructions, the switch only has to 51 /// be done at the entry and exit points rather than potentially in each 52 /// block of the region. 53 /// 54 //===----------------------------------------------------------------------===// 55 56 #include "AMDGPU.h" 57 #include "GCNSubtarget.h" 58 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 59 #include "llvm/ADT/MapVector.h" 60 #include "llvm/ADT/PostOrderIterator.h" 61 #include "llvm/CodeGen/LiveIntervals.h" 62 #include "llvm/CodeGen/MachineBasicBlock.h" 63 #include "llvm/CodeGen/MachineDominators.h" 64 #include "llvm/CodeGen/MachineFunctionPass.h" 65 #include "llvm/CodeGen/MachineInstr.h" 66 #include "llvm/CodeGen/MachinePostDominators.h" 67 #include "llvm/IR/CallingConv.h" 68 #include "llvm/InitializePasses.h" 69 #include "llvm/Support/raw_ostream.h" 70 71 using namespace llvm; 72 73 #define DEBUG_TYPE "si-wqm" 74 75 namespace { 76 77 enum { 78 StateWQM = 0x1, 79 StateWWM = 0x2, 80 StateExact = 0x4, 81 }; 82 83 struct PrintState { 84 public: 85 int State; 86 87 explicit PrintState(int State) : State(State) {} 88 }; 89 90 #ifndef NDEBUG 91 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { 92 if (PS.State & StateWQM) 93 OS << "WQM"; 94 if (PS.State & StateWWM) { 95 if (PS.State & StateWQM) 96 OS << '|'; 97 OS << "WWM"; 98 } 99 if (PS.State & StateExact) { 100 if (PS.State & (StateWQM | StateWWM)) 101 OS << '|'; 102 OS << "Exact"; 103 } 104 105 return OS; 106 } 107 #endif 108 109 struct InstrInfo { 110 char Needs = 0; 111 char Disabled = 0; 112 char OutNeeds = 0; 113 }; 114 115 struct BlockInfo { 116 char Needs = 0; 117 char InNeeds = 0; 118 char OutNeeds = 0; 119 char InitialState = 0; 120 bool NeedsLowering = false; 121 }; 122 123 struct WorkItem { 124 MachineBasicBlock *MBB = nullptr; 125 MachineInstr *MI = nullptr; 126 127 WorkItem() = default; 128 WorkItem(MachineBasicBlock 
*MBB) : MBB(MBB) {} 129 WorkItem(MachineInstr *MI) : MI(MI) {} 130 }; 131 132 class SIWholeQuadMode : public MachineFunctionPass { 133 private: 134 const SIInstrInfo *TII; 135 const SIRegisterInfo *TRI; 136 const GCNSubtarget *ST; 137 MachineRegisterInfo *MRI; 138 LiveIntervals *LIS; 139 MachineDominatorTree *MDT; 140 MachinePostDominatorTree *PDT; 141 142 unsigned AndOpc; 143 unsigned AndN2Opc; 144 unsigned XorOpc; 145 unsigned AndSaveExecOpc; 146 unsigned OrSaveExecOpc; 147 unsigned WQMOpc; 148 Register Exec; 149 Register LiveMaskReg; 150 151 DenseMap<const MachineInstr *, InstrInfo> Instructions; 152 MapVector<MachineBasicBlock *, BlockInfo> Blocks; 153 154 // Tracks state (WQM/WWM/Exact) after a given instruction 155 DenseMap<const MachineInstr *, char> StateTransition; 156 157 SmallVector<MachineInstr *, 2> LiveMaskQueries; 158 SmallVector<MachineInstr *, 4> LowerToMovInstrs; 159 SmallVector<MachineInstr *, 4> LowerToCopyInstrs; 160 SmallVector<MachineInstr *, 4> KillInstrs; 161 162 void printInfo(); 163 164 void markInstruction(MachineInstr &MI, char Flag, 165 std::vector<WorkItem> &Worklist); 166 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, 167 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); 168 void markInstructionUses(const MachineInstr &MI, char Flag, 169 std::vector<WorkItem> &Worklist); 170 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); 171 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); 172 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); 173 char analyzeFunction(MachineFunction &MF); 174 175 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, 176 MachineBasicBlock::iterator Before); 177 MachineBasicBlock::iterator 178 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, 179 MachineBasicBlock::iterator Last, bool PreferLast, 180 bool SaveSCC); 181 void toExact(MachineBasicBlock &MBB, 
MachineBasicBlock::iterator Before, 182 Register SaveWQM); 183 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, 184 Register SavedWQM); 185 void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, 186 Register SaveOrig); 187 void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, 188 Register SavedOrig, char NonWWMState); 189 190 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); 191 192 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, 193 bool IsWQM); 194 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); 195 196 void lowerBlock(MachineBasicBlock &MBB); 197 void processBlock(MachineBasicBlock &MBB, bool IsEntry); 198 199 void lowerLiveMaskQueries(); 200 void lowerCopyInstrs(); 201 void lowerKillInstrs(bool IsWQM); 202 203 public: 204 static char ID; 205 206 SIWholeQuadMode() : 207 MachineFunctionPass(ID) { } 208 209 bool runOnMachineFunction(MachineFunction &MF) override; 210 211 StringRef getPassName() const override { return "SI Whole Quad Mode"; } 212 213 void getAnalysisUsage(AnalysisUsage &AU) const override { 214 AU.addRequired<LiveIntervals>(); 215 AU.addPreserved<SlotIndexes>(); 216 AU.addPreserved<LiveIntervals>(); 217 AU.addRequired<MachineDominatorTree>(); 218 AU.addPreserved<MachineDominatorTree>(); 219 AU.addRequired<MachinePostDominatorTree>(); 220 AU.addPreserved<MachinePostDominatorTree>(); 221 MachineFunctionPass::getAnalysisUsage(AU); 222 } 223 224 MachineFunctionProperties getClearedProperties() const override { 225 return MachineFunctionProperties().set( 226 MachineFunctionProperties::Property::IsSSA); 227 } 228 }; 229 230 } // end anonymous namespace 231 232 char SIWholeQuadMode::ID = 0; 233 234 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, 235 false) 236 INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 237 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 238 
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) 239 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, 240 false) 241 242 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; 243 244 FunctionPass *llvm::createSIWholeQuadModePass() { 245 return new SIWholeQuadMode; 246 } 247 248 #ifndef NDEBUG 249 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { 250 for (const auto &BII : Blocks) { 251 dbgs() << "\n" 252 << printMBBReference(*BII.first) << ":\n" 253 << " InNeeds = " << PrintState(BII.second.InNeeds) 254 << ", Needs = " << PrintState(BII.second.Needs) 255 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n"; 256 257 for (const MachineInstr &MI : *BII.first) { 258 auto III = Instructions.find(&MI); 259 if (III == Instructions.end()) 260 continue; 261 262 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) 263 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; 264 } 265 } 266 } 267 #endif 268 269 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, 270 std::vector<WorkItem> &Worklist) { 271 InstrInfo &II = Instructions[&MI]; 272 273 assert(!(Flag & StateExact) && Flag != 0); 274 275 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); 276 277 // Remove any disabled states from the flag. The user that required it gets 278 // an undefined value in the helper lanes. For example, this can happen if 279 // the result of an atomic is used by instruction that requires WQM, where 280 // ignoring the request for WQM is correct as per the relevant specs. 281 Flag &= ~II.Disabled; 282 283 // Ignore if the flag is already encompassed by the existing needs, or we 284 // just disabled everything. 285 if ((II.Needs & Flag) == Flag) 286 return; 287 288 II.Needs |= Flag; 289 Worklist.push_back(&MI); 290 } 291 292 /// Mark all relevant definitions of register \p Reg in usage \p UseMI. 
293 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, 294 Register Reg, unsigned SubReg, char Flag, 295 std::vector<WorkItem> &Worklist) { 296 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); 297 298 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); 299 if (!UseLRQ.valueIn()) 300 return; 301 302 SmallPtrSet<const VNInfo *, 4> Visited; 303 SmallVector<const VNInfo *, 4> ToProcess; 304 ToProcess.push_back(UseLRQ.valueIn()); 305 do { 306 const VNInfo *Value = ToProcess.pop_back_val(); 307 Visited.insert(Value); 308 309 if (Value->isPHIDef()) { 310 // Need to mark all defs used in the PHI node 311 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def); 312 assert(MBB && "Phi-def has no defining MBB"); 313 for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), 314 PE = MBB->pred_end(); 315 PI != PE; ++PI) { 316 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) { 317 if (!Visited.count(VN)) 318 ToProcess.push_back(VN); 319 } 320 } 321 } else { 322 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); 323 assert(MI && "Def has no defining instruction"); 324 markInstruction(*MI, Flag, Worklist); 325 326 // Iterate over all operands to find relevant definitions 327 for (const MachineOperand &Op : MI->operands()) { 328 if (!(Op.isReg() && Op.getReg() == Reg)) 329 continue; 330 331 // Does this def cover whole register? 332 bool DefinesFullReg = 333 Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg; 334 if (!DefinesFullReg) { 335 // Partial definition; need to follow and mark input value 336 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI)); 337 if (const VNInfo *VN = LRQ.valueIn()) { 338 if (!Visited.count(VN)) 339 ToProcess.push_back(VN); 340 } 341 } 342 } 343 } 344 } while (!ToProcess.empty()); 345 } 346 347 /// Mark all instructions defining the uses in \p MI with \p Flag. 
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {

  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Reg.isVirtual()) {
      // Uses of EXEC are never followed.
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      // Physical registers are tracked per register unit.
      for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
           ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        // Skip units that have no value live into MI.
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
      }

      continue;
    }

    // Virtual register: follow the defs reaching this (sub)register use.
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
//
// Besides seeding Worklist, this fills LiveMaskQueries, LowerToMovInstrs,
// LowerToCopyInstrs and KillInstrs with the instructions to be lowered later,
// and creates the Blocks/Instructions entries for everything visited.
// Returns the union of all states (WQM/WWM/Exact) requested in the function.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  // Instructions that are only marked as WQM at the end of the scan, and only
  // if WQM turned out to be used somewhere in the function.
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      // State to request for MI itself; branches that only mark MI's inputs
      // (or nothing) `continue` instead of falling through to the marking
      // code at the bottom of the loop.
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        // Does not request WQM by itself; see the end of this function.
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and plus it needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        // The instruction itself is never marked WWM, but the instructions
        // defining a (non-undef, virtual) inactive-value operand are.
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            Register Reg = Inactive.getReg();
            if (Reg.isVirtual()) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        // The instruction needs the Exact mask: record that on the block,
        // queue the block the first time this happens, and disable WQM/WWM
        // requests for the instruction.
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

// Propagate the needs recorded for MI: up to its block, backwards to the
// preceding instruction, and to the instructions that define its inputs.
// Anything whose recorded needs change is pushed onto Worklist.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the map entry and the local copy taken above.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    // WWM is not passed on to the previous instruction's OutNeeds.
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

// Propagate the block-level needs of MBB to its last instruction, to its
// predecessors and to its successors. Anything whose recorded needs change is
// pushed onto Worklist.
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    // Nothing new for this predecessor.
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    // Nothing new for this successor.
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

// Run the analysis: seed the worklist with scanInstructions, then process
// work items until the worklist is empty. Returns the flags computed by
// scanInstructions.
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    // A work item carries either an instruction or a block.
    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

// Insert, before Before, a copy of SCC into a fresh virtual register followed
// by a copy of that register back into SCC. Returns an iterator to the
// restoring copy, so that instructions inserted before the returned position
// end up between the save and the restore.
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  // Register the new instructions and the new register with LiveIntervals.
  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Split BB at TermMI using MachineBasicBlock::splitAt and turn TermMI into the
// terminator form of its opcode where one is known. If a new block was
// produced, the dominator and post-dominator trees are updated and BB is
// linked to the new block with an explicit S_BRANCH.
// Returns the block reported by splitAt.
MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    // Any other opcode is left unchanged.
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    // The successors now hang off SplitBB instead of BB, and BB gains SplitBB
    // as its successor.
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}

// Lower a floating point conditional kill (MI) into:
//   - a compare that computes the killed lanes into VCC,
//   - removal of those lanes from the live mask,
//   - SI_EARLY_TERMINATE_SCC0,
//   - removal of those lanes from EXEC,
//   - a branch to the single successor of MBB.
// MI is removed from MBB. Returns the new S_BRANCH.
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  // Operand 2 holds the ISD condition code of the original comparison.
  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    // Use the e32 form of the compare; note the swapped operand order and
    // that no explicit VCC def is added here, unlike the e64 path below.
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(AMDGPU::VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  // The compare takes over MI's slot in the maps; the remaining new
  // instructions are inserted individually.
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}

// Lower a boolean kill or demote (MI) into updates of the live mask and of
// EXEC plus SI_EARLY_TERMINATE_SCC0. Operand 0 is the condition (immediate or
// register) and operand 1 the value of the condition for which lanes are
// killed. IsWQM tells whether MI executes in WQM state.
//
// MI is removed from MBB. Returns the instruction that updates EXEC; for a
// static kill that does nothing, returns the replacement S_BRANCH, or nullptr
// when MI is treated as a demote.
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  // A demote is only treated as such in WQM; otherwise it takes the same path
  // as a kill below.
  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (IsDemote) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // A kill is replaced by a plain branch to the single successor.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demotes deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kills deactivate lanes
    if (Op.isImm()) {
      // Static kill of all active lanes: EXEC becomes 0.
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      // Not in WQM: restrict EXEC to the updated live mask.
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      // In WQM: apply the condition to EXEC directly, inverted when the
      // condition describes the lanes to kill.
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // The condition register's uses changed, so its interval is recomputed;
  // the temporaries created above get fresh intervals.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/WWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  // Only blocks that were seen by the analysis and flagged as containing
  // kill/demote instructions need any work.
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  // Execution state at the current instruction, starting from the state
  // recorded for the start of the block.
  char State = BI.InitialState;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  while (II != IE) {
    // Remember the successor first: lowering removes MI from the block.
    auto Next = std::next(II);
    MachineInstr &MI = *II;

    // Follow the state changes recorded when transitions were inserted.
    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);

    II = Next;
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    // Each split continues in the block returned by the previous one.
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
//
// When SaveSCC is false the preferred end of the range is returned directly.
// Otherwise the function searches for a point where SCC is not live; if none
// is found inside the range, SCC is saved and restored around the insertion
// point (see saveSCC).
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  // Live range of SCC (via its register unit).
  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  // An end-of-block iterator maps to the block's end index.
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // While SCC is live at Idx, move Idx towards the other end of the range:
  // back to the start of the live segment when PreferLast, otherwise forward
  // to the instruction after the end of the segment. Stop as soon as SCC is
  // not live (S == nullptr) or the next candidate would leave [First, Last].
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  // Translate the chosen index back into an iterator.
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    // Having stepped over an EXEC def, SCC is no longer saved below.
    S = nullptr;
  }

  // S is still set only if SCC is live at the insertion point.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

// Insert, before Before, the transition to Exact state. With a valid SaveWQM
// register the save-exec form of the AND with the live mask is emitted with
// SaveWQM as its destination; otherwise EXEC is simply ANDed with the live
// mask. The new instruction is recorded as switching to Exact.
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

// Insert, before Before, the transition to WQM state. With a valid SavedWQM
// register EXEC is copied back from it; otherwise the WQM mask is recomputed
// from the current EXEC. The new instruction is recorded as switching to WQM.
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

// Insert, before Before, an ENTER_WWM with immediate -1 that defines SaveOrig
// (which must be valid). The new instruction is recorded as switching to WWM.
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWWM;
}

// Insert, before Before, an EXIT_WWM that defines EXEC from SavedOrig (which
// must be valid). The new instruction is recorded as switching to
// NonWWMState, the state to resume after leaving WWM.
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SavedOrig, char NonWWMState) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonWWMState;
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonWWMReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;

  // Record initial state in block information.
1135 BI.InitialState = State; 1136 1137 for (;;) { 1138 MachineBasicBlock::iterator Next = II; 1139 char Needs = StateExact | StateWQM; // WWM is disabled by default 1140 char OutNeeds = 0; 1141 1142 if (FirstWQM == IE) 1143 FirstWQM = II; 1144 1145 if (FirstWWM == IE) 1146 FirstWWM = II; 1147 1148 // First, figure out the allowed states (Needs) based on the propagated 1149 // flags. 1150 if (II != IE) { 1151 MachineInstr &MI = *II; 1152 1153 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) { 1154 auto III = Instructions.find(&MI); 1155 if (III != Instructions.end()) { 1156 if (III->second.Needs & StateWWM) 1157 Needs = StateWWM; 1158 else if (III->second.Needs & StateWQM) 1159 Needs = StateWQM; 1160 else 1161 Needs &= ~III->second.Disabled; 1162 OutNeeds = III->second.OutNeeds; 1163 } 1164 } else { 1165 // If the instruction doesn't actually need a correct EXEC, then we can 1166 // safely leave WWM enabled. 1167 Needs = StateExact | StateWQM | StateWWM; 1168 } 1169 1170 if (MI.isTerminator() && OutNeeds == StateExact) 1171 Needs = StateExact; 1172 1173 ++Next; 1174 } else { 1175 // End of basic block 1176 if (BI.OutNeeds & StateWQM) 1177 Needs = StateWQM; 1178 else if (BI.OutNeeds == StateExact) 1179 Needs = StateExact; 1180 else 1181 Needs = StateWQM | StateExact; 1182 } 1183 1184 // Now, transition if necessary. 
1185 if (!(Needs & State)) { 1186 MachineBasicBlock::iterator First; 1187 if (State == StateWWM || Needs == StateWWM) { 1188 // We must switch to or from WWM 1189 First = FirstWWM; 1190 } else { 1191 // We only need to switch to/from WQM, so we can use FirstWQM 1192 First = FirstWQM; 1193 } 1194 1195 // Whether we need to save SCC depends on start and end states 1196 bool SaveSCC = false; 1197 switch (State) { 1198 case StateExact: 1199 case StateWWM: 1200 // Exact/WWM -> WWM: save SCC 1201 // Exact/WWM -> WQM: save SCC if WQM mask is generated from exec 1202 // Exact/WWM -> Exact: no save 1203 SaveSCC = (Needs & StateWWM) || ((Needs & StateWQM) && WQMFromExec); 1204 break; 1205 case StateWQM: 1206 // WQM -> Exact/WMM: save SCC 1207 SaveSCC = !(Needs & StateWQM); 1208 break; 1209 default: 1210 llvm_unreachable("Unknown state"); 1211 break; 1212 } 1213 MachineBasicBlock::iterator Before = 1214 prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC); 1215 1216 if (State == StateWWM) { 1217 assert(SavedNonWWMReg); 1218 fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); 1219 LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); 1220 SavedNonWWMReg = 0; 1221 State = NonWWMState; 1222 } 1223 1224 if (Needs == StateWWM) { 1225 NonWWMState = State; 1226 assert(!SavedNonWWMReg); 1227 SavedNonWWMReg = MRI->createVirtualRegister(BoolRC); 1228 toWWM(MBB, Before, SavedNonWWMReg); 1229 State = StateWWM; 1230 } else { 1231 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { 1232 if (!WQMFromExec && (OutNeeds & StateWQM)) { 1233 assert(!SavedWQMReg); 1234 SavedWQMReg = MRI->createVirtualRegister(BoolRC); 1235 } 1236 1237 toExact(MBB, Before, SavedWQMReg); 1238 State = StateExact; 1239 } else if (State == StateExact && (Needs & StateWQM) && 1240 !(Needs & StateExact)) { 1241 assert(WQMFromExec == (SavedWQMReg == 0)); 1242 1243 toWQM(MBB, Before, SavedWQMReg); 1244 1245 if (SavedWQMReg) { 1246 LIS->createAndComputeVirtRegInterval(SavedWQMReg); 1247 
SavedWQMReg = 0; 1248 } 1249 State = StateWQM; 1250 } else { 1251 // We can get here if we transitioned from WWM to a non-WWM state that 1252 // already matches our needs, but we shouldn't need to do anything. 1253 assert(Needs & State); 1254 } 1255 } 1256 } 1257 1258 if (Needs != (StateExact | StateWQM | StateWWM)) { 1259 if (Needs != (StateExact | StateWQM)) 1260 FirstWQM = IE; 1261 FirstWWM = IE; 1262 } 1263 1264 if (II == IE) 1265 break; 1266 1267 II = Next; 1268 } 1269 assert(!SavedWQMReg); 1270 assert(!SavedNonWWMReg); 1271 } 1272 1273 void SIWholeQuadMode::lowerLiveMaskQueries() { 1274 for (MachineInstr *MI : LiveMaskQueries) { 1275 const DebugLoc &DL = MI->getDebugLoc(); 1276 Register Dest = MI->getOperand(0).getReg(); 1277 1278 MachineInstr *Copy = 1279 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) 1280 .addReg(LiveMaskReg); 1281 1282 LIS->ReplaceMachineInstrInMaps(*MI, *Copy); 1283 MI->eraseFromParent(); 1284 } 1285 } 1286 1287 void SIWholeQuadMode::lowerCopyInstrs() { 1288 for (MachineInstr *MI : LowerToMovInstrs) { 1289 assert(MI->getNumExplicitOperands() == 2); 1290 1291 const Register Reg = MI->getOperand(0).getReg(); 1292 const unsigned SubReg = MI->getOperand(0).getSubReg(); 1293 1294 if (TRI->isVGPR(*MRI, Reg)) { 1295 const TargetRegisterClass *regClass = 1296 Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg); 1297 if (SubReg) 1298 regClass = TRI->getSubRegClass(regClass, SubReg); 1299 1300 const unsigned MovOp = TII->getMovOpcode(regClass); 1301 MI->setDesc(TII->get(MovOp)); 1302 1303 // And make it implicitly depend on exec (like all VALU movs should do). 1304 MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 1305 } else { 1306 // Remove early-clobber and exec dependency from simple SGPR copies. 1307 // This allows some to be eliminated during/post RA. 
1308 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); 1309 if (MI->getOperand(0).isEarlyClobber()) { 1310 LIS->removeInterval(Reg); 1311 MI->getOperand(0).setIsEarlyClobber(false); 1312 LIS->createAndComputeVirtRegInterval(Reg); 1313 } 1314 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); 1315 while (Index >= 0) { 1316 MI->RemoveOperand(Index); 1317 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); 1318 } 1319 MI->setDesc(TII->get(AMDGPU::COPY)); 1320 LLVM_DEBUG(dbgs() << " -> " << *MI); 1321 } 1322 } 1323 for (MachineInstr *MI : LowerToCopyInstrs) { 1324 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || 1325 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { 1326 assert(MI->getNumExplicitOperands() == 3); 1327 // the only reason we should be here is V_SET_INACTIVE has 1328 // an undef input so it is being replaced by a simple copy. 1329 // There should be a second undef source that we should remove. 1330 assert(MI->getOperand(2).isUndef()); 1331 MI->RemoveOperand(2); 1332 MI->untieRegOperand(1); 1333 } else { 1334 assert(MI->getNumExplicitOperands() == 2); 1335 } 1336 1337 MI->setDesc(TII->get(AMDGPU::COPY)); 1338 } 1339 } 1340 1341 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { 1342 for (MachineInstr *MI : KillInstrs) { 1343 MachineBasicBlock *MBB = MI->getParent(); 1344 MachineInstr *SplitPoint = nullptr; 1345 switch (MI->getOpcode()) { 1346 case AMDGPU::SI_DEMOTE_I1: 1347 case AMDGPU::SI_KILL_I1_TERMINATOR: 1348 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM); 1349 break; 1350 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 1351 SplitPoint = lowerKillF32(*MBB, *MI); 1352 break; 1353 default: 1354 continue; 1355 } 1356 if (SplitPoint) 1357 splitBlock(MBB, SplitPoint); 1358 } 1359 } 1360 1361 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { 1362 Instructions.clear(); 1363 Blocks.clear(); 1364 LiveMaskQueries.clear(); 1365 LowerToCopyInstrs.clear(); 1366 LowerToMovInstrs.clear(); 1367 KillInstrs.clear(); 1368 StateTransition.clear(); 
1369 1370 ST = &MF.getSubtarget<GCNSubtarget>(); 1371 1372 TII = ST->getInstrInfo(); 1373 TRI = &TII->getRegisterInfo(); 1374 MRI = &MF.getRegInfo(); 1375 LIS = &getAnalysis<LiveIntervals>(); 1376 MDT = &getAnalysis<MachineDominatorTree>(); 1377 PDT = &getAnalysis<MachinePostDominatorTree>(); 1378 1379 if (ST->isWave32()) { 1380 AndOpc = AMDGPU::S_AND_B32; 1381 AndN2Opc = AMDGPU::S_ANDN2_B32; 1382 XorOpc = AMDGPU::S_XOR_B32; 1383 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; 1384 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; 1385 WQMOpc = AMDGPU::S_WQM_B32; 1386 Exec = AMDGPU::EXEC_LO; 1387 } else { 1388 AndOpc = AMDGPU::S_AND_B64; 1389 AndN2Opc = AMDGPU::S_ANDN2_B64; 1390 XorOpc = AMDGPU::S_XOR_B64; 1391 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; 1392 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; 1393 WQMOpc = AMDGPU::S_WQM_B64; 1394 Exec = AMDGPU::EXEC; 1395 } 1396 1397 const char GlobalFlags = analyzeFunction(MF); 1398 const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); 1399 1400 LiveMaskReg = Exec; 1401 1402 // Shader is simple does not need WQM/WWM or any complex lowering 1403 if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() && 1404 LowerToMovInstrs.empty() && KillInstrs.empty()) { 1405 lowerLiveMaskQueries(); 1406 return !LiveMaskQueries.empty(); 1407 } 1408 1409 MachineBasicBlock &Entry = MF.front(); 1410 MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); 1411 1412 // Store a copy of the original live mask when required 1413 if (NeedsLiveMask || (GlobalFlags & StateWQM)) { 1414 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); 1415 MachineInstr *MI = 1416 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) 1417 .addReg(Exec); 1418 LIS->InsertMachineInstrInMaps(*MI); 1419 } 1420 1421 LLVM_DEBUG(printInfo()); 1422 1423 lowerLiveMaskQueries(); 1424 lowerCopyInstrs(); 1425 1426 // Shader only needs WQM 1427 if (GlobalFlags == StateWQM) { 1428 auto MI = BuildMI(Entry, EntryMI, 
DebugLoc(), TII->get(WQMOpc), Exec) 1429 .addReg(Exec); 1430 LIS->InsertMachineInstrInMaps(*MI); 1431 lowerKillInstrs(true); 1432 } else { 1433 for (auto BII : Blocks) 1434 processBlock(*BII.first, BII.first == &Entry); 1435 // Lowering blocks causes block splitting so perform as a second pass. 1436 for (auto BII : Blocks) 1437 lowerBlock(*BII.first); 1438 } 1439 1440 // Compute live range for live mask 1441 if (LiveMaskReg != Exec) 1442 LIS->createAndComputeVirtRegInterval(LiveMaskReg); 1443 1444 // Physical registers like SCC aren't tracked by default anyway, so just 1445 // removing the ranges we computed is the simplest option for maintaining 1446 // the analysis results. 1447 LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); 1448 1449 // If we performed any kills then recompute EXEC 1450 if (!KillInstrs.empty()) 1451 LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); 1452 1453 return true; 1454 } 1455