//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

// Pass statistics, reported with -stats.
STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instruction reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

/// Machine-function pass that rewrites instructions into smaller encodings
/// where legal (e.g. VOP3 -> VOP1/VOP2/VOPC, S_ADD_I32 -> S_ADDK_I32), folds
/// literal constants, and performs related code-size peepholes. The work is
/// done in runOnMachineFunction; this class only wires up the pass framework.
class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only rewrites instructions in place; it never adds, removes or
    // reorders basic blocks.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.
INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
/// \returns true if a fold happened (the defining move is erased in that
/// case). When \p TryToCommute is set and src0 cannot be folded, the
/// instruction is commuted once and the fold retried on the new src0.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    // Only fold when this instruction is the sole user, so the defining move
    // can be deleted afterwards.
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          // Frame indices are materialized like immediates; fold them too.
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  // TryToCommute=false on the recursive call prevents infinite recursion.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// \returns true if \p Src is a signed 16-bit immediate that is not already
/// representable as an inline constant (i.e. it would benefit from a KImm
/// encoding).
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

/// \returns true if \p Src is an unsigned 16-bit immediate that is not
/// already representable as an inline constant.
static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

/// \returns true if \p Src fits in 16 bits either signed or unsigned and is
/// not an inline constant. \p IsUnsigned reports which interpretation was
/// used (signed is preferred when both apply).
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  // [-16, 64] is the range of integer inline immediates on this target.
  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  // Operands past the declared explicit + implicit operand counts are extras
  // added after selection (e.g. extra implicit uses/defs or regmasks); carry
  // them over to the shrunk instruction.
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

/// Rewrite a scalar compare (SOPC) into the shorter s_cmpk_* (SOPK) form when
/// its immediate operand fits in 16 bits. Modifies \p MI in place; leaves it
/// untouched when no SOPK counterpart or suitable immediate exists.
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        // Immediate only fits as signed: switch to the signed eq/ne form.
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  // The immediate interpretation (zero- vs sign-extended) must match the
  // SOPK opcode's extension behavior.
  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
218 /// \returns true if the caller should continue the machine function iterator 219 static bool shrinkScalarLogicOp(const GCNSubtarget &ST, 220 MachineRegisterInfo &MRI, 221 const SIInstrInfo *TII, 222 MachineInstr &MI) { 223 unsigned Opc = MI.getOpcode(); 224 const MachineOperand *Dest = &MI.getOperand(0); 225 MachineOperand *Src0 = &MI.getOperand(1); 226 MachineOperand *Src1 = &MI.getOperand(2); 227 MachineOperand *SrcReg = Src0; 228 MachineOperand *SrcImm = Src1; 229 230 if (SrcImm->isImm() && 231 !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) { 232 uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); 233 uint32_t NewImm = 0; 234 235 if (Opc == AMDGPU::S_AND_B32) { 236 if (isPowerOf2_32(~Imm)) { 237 NewImm = countTrailingOnes(Imm); 238 Opc = AMDGPU::S_BITSET0_B32; 239 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 240 NewImm = ~Imm; 241 Opc = AMDGPU::S_ANDN2_B32; 242 } 243 } else if (Opc == AMDGPU::S_OR_B32) { 244 if (isPowerOf2_32(Imm)) { 245 NewImm = countTrailingZeros(Imm); 246 Opc = AMDGPU::S_BITSET1_B32; 247 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 248 NewImm = ~Imm; 249 Opc = AMDGPU::S_ORN2_B32; 250 } 251 } else if (Opc == AMDGPU::S_XOR_B32) { 252 if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 253 NewImm = ~Imm; 254 Opc = AMDGPU::S_XNOR_B32; 255 } 256 } else { 257 llvm_unreachable("unexpected opcode"); 258 } 259 260 if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && 261 SrcImm == Src0) { 262 if (!TII->commuteInstruction(MI, false, 1, 2)) 263 NewImm = 0; 264 } 265 266 if (NewImm != 0) { 267 if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && 268 SrcReg->isReg()) { 269 MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); 270 MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); 271 return true; 272 } 273 274 if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { 275 MI.setDesc(TII->get(Opc)); 276 if 
(Opc == AMDGPU::S_BITSET0_B32 || 277 Opc == AMDGPU::S_BITSET1_B32) { 278 Src0->ChangeToImmediate(NewImm); 279 // Remove the immediate and add the tied input. 280 MI.getOperand(2).ChangeToRegister(Dest->getReg(), false); 281 MI.tieOperands(0, 2); 282 } else { 283 SrcImm->setImm(NewImm); 284 } 285 } 286 } 287 } 288 289 return false; 290 } 291 292 // This is the same as MachineInstr::readsRegister/modifiesRegister except 293 // it takes subregs into account. 294 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, 295 unsigned Reg, unsigned SubReg, 296 const SIRegisterInfo &TRI) { 297 for (const MachineOperand &MO : R) { 298 if (!MO.isReg()) 299 continue; 300 301 if (TargetRegisterInfo::isPhysicalRegister(Reg) && 302 TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { 303 if (TRI.regsOverlap(Reg, MO.getReg())) 304 return true; 305 } else if (MO.getReg() == Reg && 306 TargetRegisterInfo::isVirtualRegister(Reg)) { 307 LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & 308 TRI.getSubRegIndexLaneMask(MO.getSubReg()); 309 if (Overlap.any()) 310 return true; 311 } 312 } 313 return false; 314 } 315 316 static bool instReadsReg(const MachineInstr *MI, 317 unsigned Reg, unsigned SubReg, 318 const SIRegisterInfo &TRI) { 319 return instAccessReg(MI->uses(), Reg, SubReg, TRI); 320 } 321 322 static bool instModifiesReg(const MachineInstr *MI, 323 unsigned Reg, unsigned SubReg, 324 const SIRegisterInfo &TRI) { 325 return instAccessReg(MI->defs(), Reg, SubReg, TRI); 326 } 327 328 static TargetInstrInfo::RegSubRegPair 329 getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I, 330 const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { 331 if (TRI.getRegSizeInBits(Reg, MRI) != 32) { 332 if (TargetRegisterInfo::isPhysicalRegister(Reg)) { 333 Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); 334 } else { 335 LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub); 336 Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger())); 
337 } 338 } 339 return TargetInstrInfo::RegSubRegPair(Reg, Sub); 340 } 341 342 // Match: 343 // mov t, x 344 // mov x, y 345 // mov y, t 346 // 347 // => 348 // 349 // mov t, x (t is potentially dead and move eliminated) 350 // v_swap_b32 x, y 351 // 352 // Returns next valid instruction pointer if was able to create v_swap_b32. 353 // 354 // This shall not be done too early not to prevent possible folding which may 355 // remove matched moves, and this should prefereably be done before RA to 356 // release saved registers and also possibly after RA which can insert copies 357 // too. 358 // 359 // This is really just a generic peephole that is not a canocical shrinking, 360 // although requirements match the pass placement and it reduces code size too. 361 static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, 362 const SIInstrInfo *TII) { 363 assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || 364 MovT.getOpcode() == AMDGPU::COPY); 365 366 unsigned T = MovT.getOperand(0).getReg(); 367 unsigned Tsub = MovT.getOperand(0).getSubReg(); 368 MachineOperand &Xop = MovT.getOperand(1); 369 370 if (!Xop.isReg()) 371 return nullptr; 372 unsigned X = Xop.getReg(); 373 unsigned Xsub = Xop.getSubReg(); 374 375 unsigned Size = TII->getOpSize(MovT, 0) / 4; 376 377 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 378 if (!TRI.isVGPR(MRI, X)) 379 return nullptr; 380 381 for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) { 382 if (YTop.getSubReg() != Tsub) 383 continue; 384 385 MachineInstr &MovY = *YTop.getParent(); 386 if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 && 387 MovY.getOpcode() != AMDGPU::COPY) || 388 MovY.getOperand(1).getSubReg() != Tsub) 389 continue; 390 391 unsigned Y = MovY.getOperand(0).getReg(); 392 unsigned Ysub = MovY.getOperand(0).getSubReg(); 393 394 if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent()) 395 continue; 396 397 MachineInstr *MovX = nullptr; 398 auto I = std::next(MovT.getIterator()), E = 
MovT.getParent()->instr_end(); 399 for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) { 400 if (instReadsReg(&*I, X, Xsub, TRI) || 401 instModifiesReg(&*I, Y, Ysub, TRI) || 402 instModifiesReg(&*I, T, Tsub, TRI) || 403 (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { 404 MovX = nullptr; 405 break; 406 } 407 if (!instReadsReg(&*I, Y, Ysub, TRI)) { 408 if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { 409 MovX = nullptr; 410 break; 411 } 412 continue; 413 } 414 if (MovX || 415 (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && 416 I->getOpcode() != AMDGPU::COPY) || 417 I->getOperand(0).getReg() != X || 418 I->getOperand(0).getSubReg() != Xsub) { 419 MovX = nullptr; 420 break; 421 } 422 MovX = &*I; 423 } 424 425 if (!MovX || I == E) 426 continue; 427 428 LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY); 429 430 for (unsigned I = 0; I < Size; ++I) { 431 TargetInstrInfo::RegSubRegPair X1, Y1; 432 X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); 433 Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); 434 BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(), 435 TII->get(AMDGPU::V_SWAP_B32)) 436 .addDef(X1.Reg, 0, X1.SubReg) 437 .addDef(Y1.Reg, 0, Y1.SubReg) 438 .addReg(Y1.Reg, 0, Y1.SubReg) 439 .addReg(X1.Reg, 0, X1.SubReg).getInstr(); 440 } 441 MovX->eraseFromParent(); 442 MovY.eraseFromParent(); 443 MachineInstr *Next = &*std::next(MovT.getIterator()); 444 if (MRI.use_nodbg_empty(T)) 445 MovT.eraseFromParent(); 446 else 447 Xop.setIsKill(false); 448 449 return Next; 450 } 451 452 return nullptr; 453 } 454 455 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { 456 if (skipFunction(MF.getFunction())) 457 return false; 458 459 MachineRegisterInfo &MRI = MF.getRegInfo(); 460 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 461 const SIInstrInfo *TII = ST.getInstrInfo(); 462 463 std::vector<unsigned> I1Defs; 464 465 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 466 BI != BE; ++BI) { 467 468 
MachineBasicBlock &MBB = *BI; 469 MachineBasicBlock::iterator I, Next; 470 for (I = MBB.begin(); I != MBB.end(); I = Next) { 471 Next = std::next(I); 472 MachineInstr &MI = *I; 473 474 if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { 475 // If this has a literal constant source that is the same as the 476 // reversed bits of an inline immediate, replace with a bitreverse of 477 // that constant. This saves 4 bytes in the common case of materializing 478 // sign bits. 479 480 // Test if we are after regalloc. We only want to do this after any 481 // optimizations happen because this will confuse them. 482 // XXX - not exactly a check for post-regalloc run. 483 MachineOperand &Src = MI.getOperand(1); 484 if (Src.isImm() && 485 TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) { 486 int32_t ReverseImm; 487 if (isReverseInlineImm(TII, Src, ReverseImm)) { 488 MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); 489 Src.setImm(ReverseImm); 490 continue; 491 } 492 } 493 } 494 495 if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || 496 MI.getOpcode() == AMDGPU::COPY)) { 497 if (auto *NextMI = matchSwap(MI, MRI, TII)) { 498 Next = NextMI->getIterator(); 499 continue; 500 } 501 } 502 503 // Combine adjacent s_nops to use the immediate operand encoding how long 504 // to wait. 505 // 506 // s_nop N 507 // s_nop M 508 // => 509 // s_nop (N + M) 510 if (MI.getOpcode() == AMDGPU::S_NOP && 511 Next != MBB.end() && 512 (*Next).getOpcode() == AMDGPU::S_NOP) { 513 514 MachineInstr &NextMI = *Next; 515 // The instruction encodes the amount to wait with an offset of 1, 516 // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back 517 // after adding. 518 uint8_t Nop0 = MI.getOperand(0).getImm() + 1; 519 uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1; 520 521 // Make sure we don't overflow the bounds. 
522 if (Nop0 + Nop1 <= 8) { 523 NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1); 524 MI.eraseFromParent(); 525 } 526 527 continue; 528 } 529 530 // FIXME: We also need to consider movs of constant operands since 531 // immediate operands are not folded if they have more than one use, and 532 // the operand folding pass is unaware if the immediate will be free since 533 // it won't know if the src == dest constraint will end up being 534 // satisfied. 535 if (MI.getOpcode() == AMDGPU::S_ADD_I32 || 536 MI.getOpcode() == AMDGPU::S_MUL_I32) { 537 const MachineOperand *Dest = &MI.getOperand(0); 538 MachineOperand *Src0 = &MI.getOperand(1); 539 MachineOperand *Src1 = &MI.getOperand(2); 540 541 if (!Src0->isReg() && Src1->isReg()) { 542 if (TII->commuteInstruction(MI, false, 1, 2)) 543 std::swap(Src0, Src1); 544 } 545 546 // FIXME: This could work better if hints worked with subregisters. If 547 // we have a vector add of a constant, we usually don't get the correct 548 // allocation due to the subregister usage. 549 if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && 550 Src0->isReg()) { 551 MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); 552 MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); 553 continue; 554 } 555 556 if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { 557 if (Src1->isImm() && isKImmOperand(TII, *Src1)) { 558 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? 559 AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; 560 561 MI.setDesc(TII->get(Opc)); 562 MI.tieOperands(0, 1); 563 } 564 } 565 } 566 567 // Try to use s_cmpk_* 568 if (MI.isCompare() && TII->isSOPC(MI)) { 569 shrinkScalarCompare(TII, MI); 570 continue; 571 } 572 573 // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. 
574 if (MI.getOpcode() == AMDGPU::S_MOV_B32) { 575 const MachineOperand &Dst = MI.getOperand(0); 576 MachineOperand &Src = MI.getOperand(1); 577 578 if (Src.isImm() && 579 TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) { 580 int32_t ReverseImm; 581 if (isKImmOperand(TII, Src)) 582 MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); 583 else if (isReverseInlineImm(TII, Src, ReverseImm)) { 584 MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); 585 Src.setImm(ReverseImm); 586 } 587 } 588 589 continue; 590 } 591 592 // Shrink scalar logic operations. 593 if (MI.getOpcode() == AMDGPU::S_AND_B32 || 594 MI.getOpcode() == AMDGPU::S_OR_B32 || 595 MI.getOpcode() == AMDGPU::S_XOR_B32) { 596 if (shrinkScalarLogicOp(ST, MRI, TII, MI)) 597 continue; 598 } 599 600 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 601 continue; 602 603 if (!TII->canShrink(MI, MRI)) { 604 // Try commuting the instruction and see if that enables us to shrink 605 // it. 606 if (!MI.isCommutable() || !TII->commuteInstruction(MI) || 607 !TII->canShrink(MI, MRI)) 608 continue; 609 } 610 611 // getVOPe32 could be -1 here if we started with an instruction that had 612 // a 32-bit encoding and then commuted it to an instruction that did not. 613 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 614 continue; 615 616 int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); 617 618 if (TII->isVOPC(Op32)) { 619 unsigned DstReg = MI.getOperand(0).getReg(); 620 if (TargetRegisterInfo::isVirtualRegister(DstReg)) { 621 // VOPC instructions can only write to the VCC register. We can't 622 // force them to use VCC here, because this is only one register and 623 // cannot deal with sequences which would require multiple copies of 624 // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) 625 // 626 // So, instead of forcing the instruction to write to VCC, we provide 627 // a hint to the register allocator to use VCC and then we will run 628 // this pass again after RA and shrink it if it outputs to VCC. 
629 MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); 630 continue; 631 } 632 if (DstReg != AMDGPU::VCC) 633 continue; 634 } 635 636 if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { 637 // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC 638 // instructions. 639 const MachineOperand *Src2 = 640 TII->getNamedOperand(MI, AMDGPU::OpName::src2); 641 if (!Src2->isReg()) 642 continue; 643 unsigned SReg = Src2->getReg(); 644 if (TargetRegisterInfo::isVirtualRegister(SReg)) { 645 MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); 646 continue; 647 } 648 if (SReg != AMDGPU::VCC) 649 continue; 650 } 651 652 // Check for the bool flag output for instructions like V_ADD_I32_e64. 653 const MachineOperand *SDst = TII->getNamedOperand(MI, 654 AMDGPU::OpName::sdst); 655 656 // Check the carry-in operand for v_addc_u32_e64. 657 const MachineOperand *Src2 = TII->getNamedOperand(MI, 658 AMDGPU::OpName::src2); 659 660 if (SDst) { 661 if (SDst->getReg() != AMDGPU::VCC) { 662 if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) 663 MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); 664 continue; 665 } 666 667 // All of the instructions with carry outs also have an SGPR input in 668 // src2. 669 if (Src2 && Src2->getReg() != AMDGPU::VCC) { 670 if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) 671 MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); 672 673 continue; 674 } 675 } 676 677 // We can shrink this instruction 678 LLVM_DEBUG(dbgs() << "Shrinking " << MI); 679 680 MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32); 681 ++NumInstructionsShrunk; 682 683 // Copy extra operands not present in the instruction definition. 684 copyExtraImplicitOps(*Inst32, MF, MI); 685 686 MI.eraseFromParent(); 687 foldImmediates(*Inst32, TII, MRI); 688 689 LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); 690 } 691 } 692 return false; 693 } 694