//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I,
               unsigned CopyToExec) const;

  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(MachineInstr &Origin,
                                   std::function<bool(MachineInstr *)> Pred,
                                   ArrayRef<MCRegister> NonModifiableRegs,
                                   unsigned MaxInstructions = 20) const;
  MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
                                                  MCRegister Exec) const;
  bool optimizeExecSequence() const;
  bool optimizeVCmpxAndSaveexecSequence() const;
  bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                          MachineInstr &VCmp,
                                          MCRegister Exec) const;

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
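// Illustrative sketch of the copy shapes the helpers below recognize; the
// register names are hypothetical examples (wave64 shown), not taken from a
// specific test:
//   %sgpr0_sgpr1 = COPY $exec            ; isCopyFromExec -> %sgpr0_sgpr1
//   $exec = S_MOV_B64 %sgpr0_sgpr1       ; isCopyToExec   -> %sgpr0_sgpr1
//   %sgpr2_sgpr3 = S_AND_B64 $exec, %sgpr4_sgpr5
//                                        ; isLogicalOpOnExec -> %sgpr2_sgpr3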
/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == TRI->getExec())
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
        MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the destination register.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
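// Map a scalar logical opcode to its fused *_SAVEEXEC form (e.g. S_AND_B64 ->
// S_AND_SAVEEXEC_B64), or INSTRUCTION_LIST_END if no such form exists.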
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  case AMDGPU::S_XOR_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  case AMDGPU::S_OR_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  case AMDGPU::S_OR_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  case AMDGPU::S_ANDN2_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  case AMDGPU::S_ANDN2_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  case AMDGPU::S_AND_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  case AMDGPU::S_AND_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  default:
    return false;
  }
}
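// For example (illustrative operands), a block-ending pseudo such as
//   $exec = S_MOV_B64_term killed %sgpr0_sgpr1
// is rewritten in place into the ordinary
//   $exec = COPY killed %sgpr0_sgpr1
// which the rest of this pass can then match as a plain copy.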
// Turn all pseudo-terminators in the block into their equivalent
// non-terminator instructions. Returns the reverse iterator to the first
// non-terminator instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB,
                                    MachineBasicBlock::reverse_iterator I,
                                    unsigned CopyToExec) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - It seems LivePhysRegs doesn't work correctly here, since it will
// incorrectly report the register as unavailable when a super-register with a
// lane mask is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for n=MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Breaks prematurely by returning nullptr if
// one of the registers given in NonModifiableRegs is modified by the current
// instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LivePhysRegs LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);
  MachineBasicBlock::reverse_iterator E(Stop);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != E; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(*MRI, Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
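// Illustrative use of the two queries above (hypothetical wave32 registers):
// with Stop = '$vcc_lo = V_CMP_EQ_U32_e64 ...' and
// Start = '%sgpr0 = S_AND_SAVEEXEC_B32 $vcc_lo',
// isRegisterInUseBetween(Stop, Start, $vcc_lo) reports whether $vcc_lo is
// still read before any redefinition in (Stop..Start], and
// isRegisterInUseAfter(Stop, $vcc_lo) asks the same for the rest of the
// block, additionally treating the block's live-outs as uses.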
// Optimize sequences emitted for control flow lowering. They are originally
// emitted as the separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
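// A concrete wave64 instance of the rewrite (illustrative registers):
//
//     %sgpr0_sgpr1 = COPY $exec
//     %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1, %sgpr4_sgpr5
//     $exec = COPY %sgpr2_sgpr3
// =>
//     %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 %sgpr4_sgpr5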
bool SIOptimizeExecMasking::optimizeExecSequence() const {
  MCRegister Exec = TRI->getExec();

  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}
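// Illustrative shape of the candidate sequence searched for below
// (hypothetical wave32 registers):
//   %sgpr0 = V_CMP_LT_U32_e64 %sgpr1, %vgpr0
//   ...                              ; no write to $exec_lo or %sgpr0 allowed
//   %sgpr2 = S_AND_SAVEEXEC_B32 %sgpr0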
// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
// sequence by looking at an instance of an s_and_saveexec instruction.
// Returns a pointer to the v_cmp instruction if it is safe to replace the
// sequence (see the conditions in the function body). This is after register
// allocation, so some checks on operand dependencies need to be considered.
MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
    MachineInstr &SaveExec, MCRegister Exec) const {

  MachineInstr *VCmp = nullptr;

  Register SaveExecDest = SaveExec.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return nullptr;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      SaveExec,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return nullptr;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      SaveExec.modifiesRegister(Src0->getReg(), TRI))
    return nullptr;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      SaveExec.modifiesRegister(Src1->getReg(), TRI))
    return nullptr;

  // Don't do the transformation if the destination operand is included in
  // its MBB's live-outs, meaning it's used in any of its successors, which
  // would lead to incorrect code if the v_cmp and therefore the def of
  // the dest operand were removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return nullptr;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after
  // the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
                             true) ||
      isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
    return nullptr;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
          NonDefRegs))
    return nullptr;

  return VCmp;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX implicitly writes to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  return true;
}

// After all s_op_saveexec instructions are inserted,
// replace (on GFX10.3 and later)
//   v_cmp_* SGPR, IMM, VGPR
//   s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// with
//   s_mov_b32 EXEC_SGPR_DEST, exec_lo
//   v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
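// A concrete instance of the pattern above (illustrative operands):
//
//     %sgpr0 = V_CMP_LT_U32_e64 %sgpr1, %vgpr0
//     %sgpr2 = S_AND_SAVEEXEC_B32 %sgpr0
// =>
//     %sgpr2 = S_MOV_B32 $exec_lo
//     V_CMPX_LT_U32_e64 %sgpr1, %vgpr0   ; writes $exec_lo implicitly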
bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
  if (!ST->hasGFX10_3Insts())
    return false;

  bool Changed = false;

  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  MCRegister Exec = TRI->getExec();
  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      // Record relevant v_cmp / s_and_saveexec instruction pairs for
      // replacement.
      if (MI.getOpcode() != AndSaveExecOpcode)
        continue;

      if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
        SaveExecVCmpMapping[&MI] = VCmp;
    }
  }

  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
      SaveExecInstr->eraseFromParent();
      VCmpInstr->eraseFromParent();

      Changed = true;
    }
  }

  return Changed;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();

  bool Changed = optimizeExecSequence();
  Changed |= optimizeVCmpxAndSaveexecSequence();

  return Changed;
}