//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  void shrinkMIMG(MachineInstr &MI);

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords <= 8) {
    RC = &AMDGPU::VReg_256RegClass;
    NewAddrDwords = 8;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.RemoveOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
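/// Illustrative rewrites (immediates chosen as arbitrary non-inline literals):
///   s_and_b32 s0, s0, 0xffff7fff  ->  s_bitset0_b32 s0, 15
///   s_or_b32  s0, s0, 0x00010000  ->  s_bitset1_b32 s0, 16
///   s_xor_b32 s0, s1, 0xffffffd5  ->  s_xnor_b32    s0, s1, 42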
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
                                MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII,
                                MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (SrcImm->isImm() &&
      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
    uint32_t NewImm = 0;

    if (Opc == AMDGPU::S_AND_B32) {
      if (isPowerOf2_32(~Imm)) {
        NewImm = countTrailingOnes(Imm);
        Opc = AMDGPU::S_BITSET0_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ANDN2_B32;
      }
    } else if (Opc == AMDGPU::S_OR_B32) {
      if (isPowerOf2_32(Imm)) {
        NewImm = countTrailingZeros(Imm);
        Opc = AMDGPU::S_BITSET1_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ORN2_B32;
      }
    } else if (Opc == AMDGPU::S_XOR_B32) {
      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_XNOR_B32;
      }
    } else {
      llvm_unreachable("unexpected opcode");
    }

    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
        SrcImm == Src0) {
      if (!TII->commuteInstruction(MI, false, 1, 2))
        NewImm = 0;
    }

    if (NewImm != 0) {
      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
          SrcReg->isReg()) {
        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
        return true;
      }

      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
        MI.setDesc(TII->get(Opc));
        if (Opc == AMDGPU::S_BITSET0_B32 ||
            Opc == AMDGPU::S_BITSET1_B32) {
          Src0->ChangeToImmediate(NewImm);
          // Remove the immediate and add the tied input.
          MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
          MI.tieOperands(0, 2);
        } else {
          SrcImm->setImm(NewImm);
        }
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
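// For virtual registers, two operands conflict only if their sub-register
// lane masks overlap (no subreg index counts as all lanes), e.g. sub0 and
// sub0_sub1 overlap while sub0 and sub1 do not.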
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                          unsigned Reg, unsigned SubReg,
                          const SIRegisterInfo &TRI) {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
      if (TRI.regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg &&
               TargetRegisterInfo::isVirtualRegister(Reg)) {
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

static bool instReadsReg(const MachineInstr *MI,
                         unsigned Reg, unsigned SubReg,
                         const SIRegisterInfo &TRI) {
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
}

static bool instModifiesReg(const MachineInstr *MI,
                            unsigned Reg, unsigned SubReg,
                            const SIRegisterInfo &TRI) {
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
}

static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
    } else {
      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create a
// v_swap_b32.
//
// This must not be done too early, so as not to prevent possible folding
// which may remove the matched moves. It should preferably be done before RA
// to release saved registers, and also possibly after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
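//
// Note: a matched COPY wider than 32 bits is lowered by emitting one
// v_swap_b32 per 32-bit lane of the swapped registers.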
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                               const SIInstrInfo *TII) {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  unsigned T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  unsigned X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.isVGPR(MRI, X))
    return nullptr;

  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
    if (YTop.getSubReg() != Tsub)
      continue;

    MachineInstr &MovY = *YTop.getParent();
    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY.getOpcode() != AMDGPU::COPY) ||
        MovY.getOperand(1).getSubReg() != Tsub)
      continue;

    unsigned Y = MovY.getOperand(0).getReg();
    unsigned Ysub = MovY.getOperand(0).getSubReg();

    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
      continue;

    MachineInstr *MovX = nullptr;
    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub, TRI) ||
          instModifiesReg(&*I, Y, Ysub, TRI) ||
          instModifiesReg(&*I, T, Tsub, TRI) ||
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      MovX = &*I;
    }

    if (!MovX || I == E)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
              TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
    }
    MovX->eraseFromParent();
    MovY.eraseFromParent();
    MachineInstr *Next = &*std::next(MovT.getIterator());
    if (MRI.use_nodbg_empty(T))
      MovT.eraseFromParent();
    else
      Xop.setIsKill(false);

    return Next;
  }

  return nullptr;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                           MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      // =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
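      // For example (values are illustrative):
      //   s_mov_b32 s0, 0x00001234  ->  s_movk_i32 s0, 0x1234
      //   s_mov_b32 s0, 0x80000000  ->  s_brev_b32 s0, 1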
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
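        // The 32-bit encoding reads its carry-in implicitly from VCC, so src2
        // must also end up in VCC before the instruction can be shrunk.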
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}