1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 /// The pass tries to use the 32-bit encoding for instructions when possible. 9 //===----------------------------------------------------------------------===// 10 // 11 12 #include "AMDGPU.h" 13 #include "AMDGPUSubtarget.h" 14 #include "SIInstrInfo.h" 15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/CodeGen/MachineFunctionPass.h" 18 #include "llvm/CodeGen/MachineInstrBuilder.h" 19 #include "llvm/CodeGen/MachineRegisterInfo.h" 20 #include "llvm/IR/Constants.h" 21 #include "llvm/IR/Function.h" 22 #include "llvm/IR/LLVMContext.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Support/raw_ostream.h" 25 #include "llvm/Target/TargetMachine.h" 26 27 #define DEBUG_TYPE "si-shrink-instructions" 28 29 STATISTIC(NumInstructionsShrunk, 30 "Number of 64-bit instruction reduced to 32-bit."); 31 STATISTIC(NumLiteralConstantsFolded, 32 "Number of literal constants folded into 32-bit instructions."); 33 34 using namespace llvm; 35 36 namespace { 37 38 class SIShrinkInstructions : public MachineFunctionPass { 39 public: 40 static char ID; 41 42 public: 43 SIShrinkInstructions() : MachineFunctionPass(ID) { 44 } 45 46 bool runOnMachineFunction(MachineFunction &MF) override; 47 48 StringRef getPassName() const override { return "SI Shrink Instructions"; } 49 50 void getAnalysisUsage(AnalysisUsage &AU) const override { 51 AU.setPreservesCFG(); 52 MachineFunctionPass::getAnalysisUsage(AU); 53 } 54 }; 55 56 } // End anonymous namespace. 
57 58 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE, 59 "SI Shrink Instructions", false, false) 60 61 char SIShrinkInstructions::ID = 0; 62 63 FunctionPass *llvm::createSIShrinkInstructionsPass() { 64 return new SIShrinkInstructions(); 65 } 66 67 /// This function checks \p MI for operands defined by a move immediate 68 /// instruction and then folds the literal constant into the instruction if it 69 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 70 static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, 71 MachineRegisterInfo &MRI, bool TryToCommute = true) { 72 assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); 73 74 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 75 76 // Try to fold Src0 77 MachineOperand &Src0 = MI.getOperand(Src0Idx); 78 if (Src0.isReg()) { 79 unsigned Reg = Src0.getReg(); 80 if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { 81 MachineInstr *Def = MRI.getUniqueVRegDef(Reg); 82 if (Def && Def->isMoveImmediate()) { 83 MachineOperand &MovSrc = Def->getOperand(1); 84 bool ConstantFolded = false; 85 86 if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || 87 isUInt<32>(MovSrc.getImm()))) { 88 // It's possible to have only one component of a super-reg defined by 89 // a single mov, so we need to clear any subregister flag. 90 Src0.setSubReg(0); 91 Src0.ChangeToImmediate(MovSrc.getImm()); 92 ConstantFolded = true; 93 } else if (MovSrc.isFI()) { 94 Src0.setSubReg(0); 95 Src0.ChangeToFrameIndex(MovSrc.getIndex()); 96 ConstantFolded = true; 97 } 98 99 if (ConstantFolded) { 100 assert(MRI.use_empty(Reg)); 101 Def->eraseFromParent(); 102 ++NumLiteralConstantsFolded; 103 return true; 104 } 105 } 106 } 107 } 108 109 // We have failed to fold src0, so commute the instruction and try again. 
110 if (TryToCommute && MI.isCommutable()) { 111 if (TII->commuteInstruction(MI)) { 112 if (foldImmediates(MI, TII, MRI, false)) 113 return true; 114 115 // Commute back. 116 TII->commuteInstruction(MI); 117 } 118 } 119 120 return false; 121 } 122 123 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 124 return isInt<16>(Src.getImm()) && 125 !TII->isInlineConstant(*Src.getParent(), 126 Src.getParent()->getOperandNo(&Src)); 127 } 128 129 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 130 return isUInt<16>(Src.getImm()) && 131 !TII->isInlineConstant(*Src.getParent(), 132 Src.getParent()->getOperandNo(&Src)); 133 } 134 135 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, 136 const MachineOperand &Src, 137 bool &IsUnsigned) { 138 if (isInt<16>(Src.getImm())) { 139 IsUnsigned = false; 140 return !TII->isInlineConstant(Src); 141 } 142 143 if (isUInt<16>(Src.getImm())) { 144 IsUnsigned = true; 145 return !TII->isInlineConstant(Src); 146 } 147 148 return false; 149 } 150 151 /// \returns true if the constant in \p Src should be replaced with a bitreverse 152 /// of an inline immediate. 153 static bool isReverseInlineImm(const SIInstrInfo *TII, 154 const MachineOperand &Src, 155 int32_t &ReverseImm) { 156 if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) 157 return false; 158 159 ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); 160 return ReverseImm >= -16 && ReverseImm <= 64; 161 } 162 163 /// Copy implicit register operands from specified instruction to this 164 /// instruction that are not part of the instruction definition. 
165 static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, 166 const MachineInstr &MI) { 167 for (unsigned i = MI.getDesc().getNumOperands() + 168 MI.getDesc().getNumImplicitUses() + 169 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 170 i != e; ++i) { 171 const MachineOperand &MO = MI.getOperand(i); 172 if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) 173 NewMI.addOperand(MF, MO); 174 } 175 } 176 177 static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { 178 // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to 179 // get constants on the RHS. 180 if (!MI.getOperand(0).isReg()) 181 TII->commuteInstruction(MI, false, 0, 1); 182 183 const MachineOperand &Src1 = MI.getOperand(1); 184 if (!Src1.isImm()) 185 return; 186 187 int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); 188 if (SOPKOpc == -1) 189 return; 190 191 // eq/ne is special because the imm16 can be treated as signed or unsigned, 192 // and initially selectd to the unsigned versions. 193 if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { 194 bool HasUImm; 195 if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { 196 if (!HasUImm) { 197 SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? 198 AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; 199 } 200 201 MI.setDesc(TII->get(SOPKOpc)); 202 } 203 204 return; 205 } 206 207 const MCInstrDesc &NewDesc = TII->get(SOPKOpc); 208 209 if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || 210 (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { 211 MI.setDesc(NewDesc); 212 } 213 } 214 215 /// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals. 216 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. 217 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or 218 /// XNOR (as a ^ b == ~(a ^ ~b)). 
/// \returns true if the caller should continue the machine function iterator
/// (this is the case when only register-allocation hints were recorded for a
/// virtual destination register); false otherwise, including when \p MI was
/// rewritten in place.
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
                                MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII,
                                MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  // The register is expected in src0 and the literal in src1.
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  // Only literals that are not already inline constants are worth rewriting.
  if (SrcImm->isImm() &&
      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
    // NewImm doubles as the "found a replacement" flag: 0 means none.
    uint32_t NewImm = 0;

    if (Opc == AMDGPU::S_AND_B32) {
      if (isPowerOf2_32(~Imm)) {
        // Exactly one bit is cleared by the mask: the new operand is that
        // bit's index.
        NewImm = countTrailingOnes(Imm);
        Opc = AMDGPU::S_BITSET0_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ANDN2_B32;
      }
    } else if (Opc == AMDGPU::S_OR_B32) {
      if (isPowerOf2_32(Imm)) {
        // Exactly one bit is set by the mask: the new operand is that bit's
        // index.
        NewImm = countTrailingZeros(Imm);
        Opc = AMDGPU::S_BITSET1_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ORN2_B32;
      }
    } else if (Opc == AMDGPU::S_XOR_B32) {
      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_XNOR_B32;
      }
    } else {
      llvm_unreachable("unexpected opcode");
    }

    // ANDN2/ORN2 are only considered with the immediate in src1; if it sits
    // in src0 the instruction is commuted, and the rewrite is abandoned when
    // the commute fails.
    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
        SrcImm == Src0) {
      if (!TII->commuteInstruction(MI, false, 1, 2))
        NewImm = 0;
    }

    if (NewImm != 0) {
      // Virtual destination: do not rewrite yet, only hint the allocator to
      // put the destination and the source in the same register.
      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
          SrcReg->isReg()) {
        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
        return true;
      }

      // The rewrite is applied only when source and destination are the same
      // register.
      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
        MI.setDesc(TII->get(Opc));
        if (Opc == AMDGPU::S_BITSET0_B32 ||
            Opc == AMDGPU::S_BITSET1_B32) {
          // The bitset forms take a single source: the bit index.
          Src0->ChangeToImmediate(NewImm);
          MI.RemoveOperand(2);
        } else {
          SrcImm->setImm(NewImm);
        }
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
//
// \p R is the operand range to scan. For a physical \p Reg, any physical
// register operand that overlaps it (per TRI.regsOverlap) counts. For a
// virtual \p Reg, an operand counts when it names the same register and the
// lane masks of \p SubReg and of the operand's subregister intersect.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                          unsigned Reg, unsigned SubReg,
                          const SIRegisterInfo &TRI) {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
      if (TRI.regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg &&
               TargetRegisterInfo::isVirtualRegister(Reg)) {
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

// \returns true if any operand in MI->uses() accesses \p Reg / \p SubReg
// according to instAccessReg.
static bool instReadsReg(const MachineInstr *MI,
                         unsigned Reg, unsigned SubReg,
                         const SIRegisterInfo &TRI) {
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
}

// \returns true if any operand in MI->defs() accesses \p Reg / \p SubReg
// according to instAccessReg.
static bool instModifiesReg(const MachineInstr *MI,
                            unsigned Reg, unsigned SubReg,
                            const SIRegisterInfo &TRI) {
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
}

// Select the register/subregister pair for channel \p I of \p Reg:\p Sub.
// A register whose size is exactly 32 bits is returned unchanged. Otherwise a
// physical register is narrowed to its subregister for channel \p I, and for
// a virtual register the subregister index is recomputed from channel
// \p I plus the number of trailing zero bits in the lane mask of \p Sub.
static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
    } else {
      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns next valid instruction pointer if was able to create v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// which may remove the matched moves. It should preferably be done before RA
// to release saved registers, and possibly also after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                               const SIInstrInfo *TII) {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  // MovT is "mov t, x": operand 0 is t, operand 1 is x.
  unsigned T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  unsigned X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  // Number of V_SWAP_B32 instructions to emit; presumably the operand size is
  // in bytes, making this the count of 32-bit pieces -- TODO confirm.
  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.isVGPR(MRI, X))
    return nullptr;

  // Each non-debug use of t is a candidate for "mov y, t".
  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
    if (YTop.getSubReg() != Tsub)
      continue;

    MachineInstr &MovY = *YTop.getParent();
    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY.getOpcode() != AMDGPU::COPY) ||
        MovY.getOperand(1).getSubReg() != Tsub)
      continue;

    unsigned Y = MovY.getOperand(0).getReg();
    unsigned Ysub = MovY.getOperand(0).getSubReg();

    // y must be a VGPR and both moves must live in the same basic block.
    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
      continue;

    // Walk the instructions strictly between MovT and MovY looking for the
    // single "mov x, y"; any interfering access aborts the match.
    MachineInstr *MovX = nullptr;
    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
      // Reject if x is read, y or t is written, or x is written again after
      // the candidate MovX has been found.
      if (instReadsReg(&*I, X, Xsub, TRI) ||
          instModifiesReg(&*I, Y, Ysub, TRI) ||
          instModifiesReg(&*I, T, Tsub, TRI) ||
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
        MovX = nullptr;
        break;
      }
      // Instructions that do not read y are skipped, unless they write x
      // before MovX has been found.
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      // This instruction reads y: it must be the one and only "mov x, y".
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      MovX = &*I;
    }

    // No MovX found, or the scan ran off the end of the block without
    // reaching MovY.
    if (!MovX || I == E)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);

    // Emit one V_SWAP_B32 per piece, inserted before MovX. Note that this
    // loop index shadows the iterator I above.
    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
                TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
    }
    MovX->eraseFromParent();
    MovY.eraseFromParent();
    // Capture the successor before MovT is possibly erased.
    MachineInstr *Next = &*std::next(MovT.getIterator());
    if (MRI.use_nodbg_empty(T))
      MovT.eraseFromParent();
    else
      // MovT stays and x is now also used by the swap after it, so MovT's
      // use of x must not be marked as a kill.
      Xop.setIsKill(false);

    return Next;
  }

  return nullptr;
}

/// Walk every instruction of \p MF and apply, in this order, the rewrites
/// implemented below: V_MOV_B32 of a literal to V_BFREV_B32, mov triples to
/// V_SWAP_B32 (when the subtarget has it), merging adjacent S_NOPs,
/// S_ADD_I32/S_MUL_I32 to their SOPK forms, scalar compares to S_CMPK_*,
/// S_MOV_B32 to S_MOVK_I32/S_BREV_B32, scalar logic ops, and finally
/// replacing VALU instructions with their 32-bit encoding.
/// Where a rewrite depends on register assignment and the register is still
/// virtual, only a register-allocation hint is recorded.
///
/// NOTE(review): every return path yields false, even when instructions were
/// rewritten or erased -- confirm this is intended.
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // NOTE(review): I1Defs is never read or written in this function.
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    // Next is computed up front so that the current instruction can be erased
    // or replaced inside the loop body.
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                           MI.getOpcode() == AMDGPU::COPY)) {
        // matchSwap may erase MI and other instructions; resume from the
        // instruction it reports.
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          // The merged wait is stored on the second nop and the first one is
          // removed, so the loop visits the merged nop next.
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        // Get the register operand into src0 if it is currently in src1.
        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        // The K forms are used only when source and destination are the same
        // register; the two operands are then tied.
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        // Only done once the destination is a physical register.
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
          continue;
      }

      // Everything below deals with replacing a VALU instruction by its
      // 32-bit encoding.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        // The carry-out must be VCC; if it is still virtual, hint towards VCC
        // and skip the instruction for now.
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      // The old instruction is replaced by Inst32; afterwards try to fold a
      // move-immediate feeding the new instruction into it.
      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}