1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// The pass tries to use the 32-bit encoding for instructions when possible. 8 //===----------------------------------------------------------------------===// 9 // 10 11 #include "AMDGPU.h" 12 #include "AMDGPUSubtarget.h" 13 #include "SIInstrInfo.h" 14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 15 #include "llvm/ADT/Statistic.h" 16 #include "llvm/CodeGen/MachineFunctionPass.h" 17 #include "llvm/CodeGen/MachineInstrBuilder.h" 18 #include "llvm/CodeGen/MachineRegisterInfo.h" 19 #include "llvm/IR/Constants.h" 20 #include "llvm/IR/Function.h" 21 #include "llvm/IR/LLVMContext.h" 22 #include "llvm/Support/Debug.h" 23 #include "llvm/Support/raw_ostream.h" 24 #include "llvm/Target/TargetMachine.h" 25 26 #define DEBUG_TYPE "si-shrink-instructions" 27 28 STATISTIC(NumInstructionsShrunk, 29 "Number of 64-bit instruction reduced to 32-bit."); 30 STATISTIC(NumLiteralConstantsFolded, 31 "Number of literal constants folded into 32-bit instructions."); 32 33 using namespace llvm; 34 35 namespace { 36 37 class SIShrinkInstructions : public MachineFunctionPass { 38 public: 39 static char ID; 40 41 void shrinkMIMG(MachineInstr &MI); 42 43 public: 44 SIShrinkInstructions() : MachineFunctionPass(ID) { 45 } 46 47 bool runOnMachineFunction(MachineFunction &MF) override; 48 49 StringRef getPassName() const override { return "SI Shrink Instructions"; } 50 51 void getAnalysisUsage(AnalysisUsage &AU) const override { 52 AU.setPreservesCFG(); 53 MachineFunctionPass::getAnalysisUsage(AU); 54 } 55 }; 56 57 } // End anonymous namespace. 58 59 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE, 60 "SI Shrink Instructions", false, false) 61 62 char SIShrinkInstructions::ID = 0; 63 64 FunctionPass *llvm::createSIShrinkInstructionsPass() { 65 return new SIShrinkInstructions(); 66 } 67 68 /// This function checks \p MI for operands defined by a move immediate 69 /// instruction and then folds the literal constant into the instruction if it 70 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 71 static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, 72 MachineRegisterInfo &MRI, bool TryToCommute = true) { 73 assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); 74 75 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 76 77 // Try to fold Src0 78 MachineOperand &Src0 = MI.getOperand(Src0Idx); 79 if (Src0.isReg()) { 80 Register Reg = Src0.getReg(); 81 if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { 82 MachineInstr *Def = MRI.getUniqueVRegDef(Reg); 83 if (Def && Def->isMoveImmediate()) { 84 MachineOperand &MovSrc = Def->getOperand(1); 85 bool ConstantFolded = false; 86 87 if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || 88 isUInt<32>(MovSrc.getImm()))) { 89 // It's possible to have only one component of a super-reg defined by 90 // a single mov, so we need to clear any subregister flag. 91 Src0.setSubReg(0); 92 Src0.ChangeToImmediate(MovSrc.getImm()); 93 ConstantFolded = true; 94 } else if (MovSrc.isFI()) { 95 Src0.setSubReg(0); 96 Src0.ChangeToFrameIndex(MovSrc.getIndex()); 97 ConstantFolded = true; 98 } else if (MovSrc.isGlobal()) { 99 Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), 100 MovSrc.getTargetFlags()); 101 ConstantFolded = true; 102 } 103 104 if (ConstantFolded) { 105 assert(MRI.use_empty(Reg)); 106 Def->eraseFromParent(); 107 ++NumLiteralConstantsFolded; 108 return true; 109 } 110 } 111 } 112 } 113 114 // We have failed to fold src0, so commute the instruction and try again. 115 if (TryToCommute && MI.isCommutable()) { 116 if (TII->commuteInstruction(MI)) { 117 if (foldImmediates(MI, TII, MRI, false)) 118 return true; 119 120 // Commute back. 121 TII->commuteInstruction(MI); 122 } 123 } 124 125 return false; 126 } 127 128 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 129 return isInt<16>(Src.getImm()) && 130 !TII->isInlineConstant(*Src.getParent(), 131 Src.getParent()->getOperandNo(&Src)); 132 } 133 134 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 135 return isUInt<16>(Src.getImm()) && 136 !TII->isInlineConstant(*Src.getParent(), 137 Src.getParent()->getOperandNo(&Src)); 138 } 139 140 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, 141 const MachineOperand &Src, 142 bool &IsUnsigned) { 143 if (isInt<16>(Src.getImm())) { 144 IsUnsigned = false; 145 return !TII->isInlineConstant(Src); 146 } 147 148 if (isUInt<16>(Src.getImm())) { 149 IsUnsigned = true; 150 return !TII->isInlineConstant(Src); 151 } 152 153 return false; 154 } 155 156 /// \returns true if the constant in \p Src should be replaced with a bitreverse 157 /// of an inline immediate. 158 static bool isReverseInlineImm(const SIInstrInfo *TII, 159 const MachineOperand &Src, 160 int32_t &ReverseImm) { 161 if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) 162 return false; 163 164 ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); 165 return ReverseImm >= -16 && ReverseImm <= 64; 166 } 167 168 /// Copy implicit register operands from specified instruction to this 169 /// instruction that are not part of the instruction definition. 170 static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, 171 const MachineInstr &MI) { 172 for (unsigned i = MI.getDesc().getNumOperands() + 173 MI.getDesc().getNumImplicitUses() + 174 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 175 i != e; ++i) { 176 const MachineOperand &MO = MI.getOperand(i); 177 if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) 178 NewMI.addOperand(MF, MO); 179 } 180 } 181 182 static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { 183 // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to 184 // get constants on the RHS. 185 if (!MI.getOperand(0).isReg()) 186 TII->commuteInstruction(MI, false, 0, 1); 187 188 // cmpk requires src0 to be a register 189 const MachineOperand &Src0 = MI.getOperand(0); 190 if (!Src0.isReg()) 191 return; 192 193 const MachineOperand &Src1 = MI.getOperand(1); 194 if (!Src1.isImm()) 195 return; 196 197 int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); 198 if (SOPKOpc == -1) 199 return; 200 201 // eq/ne is special because the imm16 can be treated as signed or unsigned, 202 // and initially selectd to the unsigned versions. 203 if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { 204 bool HasUImm; 205 if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { 206 if (!HasUImm) { 207 SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? 208 AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; 209 } 210 211 MI.setDesc(TII->get(SOPKOpc)); 212 } 213 214 return; 215 } 216 217 const MCInstrDesc &NewDesc = TII->get(SOPKOpc); 218 219 if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || 220 (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { 221 MI.setDesc(NewDesc); 222 } 223 } 224 225 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 226 void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { 227 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); 228 if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) 229 return; 230 231 MachineFunction *MF = MI.getParent()->getParent(); 232 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 233 const SIInstrInfo *TII = ST.getInstrInfo(); 234 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 235 int VAddr0Idx = 236 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); 237 unsigned NewAddrDwords = Info->VAddrDwords; 238 const TargetRegisterClass *RC; 239 240 if (Info->VAddrDwords == 2) { 241 RC = &AMDGPU::VReg_64RegClass; 242 } else if (Info->VAddrDwords == 3) { 243 RC = &AMDGPU::VReg_96RegClass; 244 } else if (Info->VAddrDwords == 4) { 245 RC = &AMDGPU::VReg_128RegClass; 246 } else if (Info->VAddrDwords <= 8) { 247 RC = &AMDGPU::VReg_256RegClass; 248 NewAddrDwords = 8; 249 } else { 250 RC = &AMDGPU::VReg_512RegClass; 251 NewAddrDwords = 16; 252 } 253 254 unsigned VgprBase = 0; 255 bool IsUndef = true; 256 bool IsKill = NewAddrDwords == Info->VAddrDwords; 257 for (unsigned i = 0; i < Info->VAddrDwords; ++i) { 258 const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); 259 unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); 260 261 if (i == 0) { 262 VgprBase = Vgpr; 263 } else if (VgprBase + i != Vgpr) 264 return; 265 266 if (!Op.isUndef()) 267 IsUndef = false; 268 if (!Op.isKill()) 269 IsKill = false; 270 } 271 272 if (VgprBase + NewAddrDwords > 256) 273 return; 274 275 // Further check for implicit tied operands - this may be present if TFE is 276 // enabled 277 int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); 278 int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe); 279 unsigned TFEVal = MI.getOperand(TFEIdx).getImm(); 280 unsigned LWEVal = MI.getOperand(LWEIdx).getImm(); 281 int ToUntie = -1; 282 if (TFEVal || LWEVal) { 283 // TFE/LWE is enabled so we need to deal with an implicit tied operand 284 for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) { 285 if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() && 286 MI.getOperand(i).isImplicit()) { 287 // This is the tied operand 288 assert( 289 ToUntie == -1 && 290 "found more than one tied implicit operand when expecting only 1"); 291 ToUntie = i; 292 MI.untieRegOperand(ToUntie); 293 } 294 } 295 } 296 297 unsigned NewOpcode = 298 AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, 299 Info->VDataDwords, NewAddrDwords); 300 MI.setDesc(TII->get(NewOpcode)); 301 MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); 302 MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); 303 MI.getOperand(VAddr0Idx).setIsKill(IsKill); 304 305 for (unsigned i = 1; i < Info->VAddrDwords; ++i) 306 MI.RemoveOperand(VAddr0Idx + 1); 307 308 if (ToUntie >= 0) { 309 MI.tieOperands( 310 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), 311 ToUntie - (Info->VAddrDwords - 1)); 312 } 313 } 314 315 /// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals. 316 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. 317 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or 318 /// XNOR (as a ^ b == ~(a ^ ~b)). 319 /// \returns true if the caller should continue the machine function iterator 320 static bool shrinkScalarLogicOp(const GCNSubtarget &ST, 321 MachineRegisterInfo &MRI, 322 const SIInstrInfo *TII, 323 MachineInstr &MI) { 324 unsigned Opc = MI.getOpcode(); 325 const MachineOperand *Dest = &MI.getOperand(0); 326 MachineOperand *Src0 = &MI.getOperand(1); 327 MachineOperand *Src1 = &MI.getOperand(2); 328 MachineOperand *SrcReg = Src0; 329 MachineOperand *SrcImm = Src1; 330 331 if (!SrcImm->isImm() || 332 AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) 333 return false; 334 335 uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); 336 uint32_t NewImm = 0; 337 338 if (Opc == AMDGPU::S_AND_B32) { 339 if (isPowerOf2_32(~Imm)) { 340 NewImm = countTrailingOnes(Imm); 341 Opc = AMDGPU::S_BITSET0_B32; 342 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 343 NewImm = ~Imm; 344 Opc = AMDGPU::S_ANDN2_B32; 345 } 346 } else if (Opc == AMDGPU::S_OR_B32) { 347 if (isPowerOf2_32(Imm)) { 348 NewImm = countTrailingZeros(Imm); 349 Opc = AMDGPU::S_BITSET1_B32; 350 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 351 NewImm = ~Imm; 352 Opc = AMDGPU::S_ORN2_B32; 353 } 354 } else if (Opc == AMDGPU::S_XOR_B32) { 355 if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 356 NewImm = ~Imm; 357 Opc = AMDGPU::S_XNOR_B32; 358 } 359 } else { 360 llvm_unreachable("unexpected opcode"); 361 } 362 363 if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && 364 SrcImm == Src0) { 365 if (!TII->commuteInstruction(MI, false, 1, 2)) 366 NewImm = 0; 367 } 368 369 if (NewImm != 0) { 370 if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) { 371 MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); 372 MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); 373 return true; 374 } 375 376 if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { 377 const bool IsUndef = SrcReg->isUndef(); 378 const bool IsKill = SrcReg->isKill(); 379 MI.setDesc(TII->get(Opc)); 380 if (Opc == AMDGPU::S_BITSET0_B32 || 381 Opc == AMDGPU::S_BITSET1_B32) { 382 Src0->ChangeToImmediate(NewImm); 383 // Remove the immediate and add the tied input. 384 MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false, 385 /*isImp*/ false, IsKill, 386 /*isDead*/ false, IsUndef); 387 MI.tieOperands(0, 2); 388 } else { 389 SrcImm->setImm(NewImm); 390 } 391 } 392 } 393 394 return false; 395 } 396 397 // This is the same as MachineInstr::readsRegister/modifiesRegister except 398 // it takes subregs into account. 399 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, 400 unsigned Reg, unsigned SubReg, 401 const SIRegisterInfo &TRI) { 402 for (const MachineOperand &MO : R) { 403 if (!MO.isReg()) 404 continue; 405 406 if (Register::isPhysicalRegister(Reg) && 407 Register::isPhysicalRegister(MO.getReg())) { 408 if (TRI.regsOverlap(Reg, MO.getReg())) 409 return true; 410 } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) { 411 LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & 412 TRI.getSubRegIndexLaneMask(MO.getSubReg()); 413 if (Overlap.any()) 414 return true; 415 } 416 } 417 return false; 418 } 419 420 static bool instReadsReg(const MachineInstr *MI, 421 unsigned Reg, unsigned SubReg, 422 const SIRegisterInfo &TRI) { 423 return instAccessReg(MI->uses(), Reg, SubReg, TRI); 424 } 425 426 static bool instModifiesReg(const MachineInstr *MI, 427 unsigned Reg, unsigned SubReg, 428 const SIRegisterInfo &TRI) { 429 return instAccessReg(MI->defs(), Reg, SubReg, TRI); 430 } 431 432 static TargetInstrInfo::RegSubRegPair 433 getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I, 434 const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { 435 if (TRI.getRegSizeInBits(Reg, MRI) != 32) { 436 if (Register::isPhysicalRegister(Reg)) { 437 Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); 438 } else { 439 Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)); 440 } 441 } 442 return TargetInstrInfo::RegSubRegPair(Reg, Sub); 443 } 444 445 // Match: 446 // mov t, x 447 // mov x, y 448 // mov y, t 449 // 450 // => 451 // 452 // mov t, x (t is potentially dead and move eliminated) 453 // v_swap_b32 x, y 454 // 455 // Returns next valid instruction pointer if was able to create v_swap_b32. 456 // 457 // This shall not be done too early not to prevent possible folding which may 458 // remove matched moves, and this should prefereably be done before RA to 459 // release saved registers and also possibly after RA which can insert copies 460 // too. 461 // 462 // This is really just a generic peephole that is not a canocical shrinking, 463 // although requirements match the pass placement and it reduces code size too. 464 static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, 465 const SIInstrInfo *TII) { 466 assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || 467 MovT.getOpcode() == AMDGPU::COPY); 468 469 Register T = MovT.getOperand(0).getReg(); 470 unsigned Tsub = MovT.getOperand(0).getSubReg(); 471 MachineOperand &Xop = MovT.getOperand(1); 472 473 if (!Xop.isReg()) 474 return nullptr; 475 Register X = Xop.getReg(); 476 unsigned Xsub = Xop.getSubReg(); 477 478 unsigned Size = TII->getOpSize(MovT, 0) / 4; 479 480 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 481 if (!TRI.isVGPR(MRI, X)) 482 return nullptr; 483 484 const unsigned SearchLimit = 16; 485 unsigned Count = 0; 486 for (auto Iter = std::next(MovT.getIterator()), 487 E = MovT.getParent()->instr_end(); 488 Iter != E && Count < SearchLimit; ++Iter, ++Count) { 489 490 MachineInstr *MovY = &*Iter; 491 if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && 492 MovY->getOpcode() != AMDGPU::COPY) || 493 !MovY->getOperand(1).isReg() || 494 MovY->getOperand(1).getReg() != T || 495 MovY->getOperand(1).getSubReg() != Tsub) 496 continue; 497 498 Register Y = MovY->getOperand(0).getReg(); 499 unsigned Ysub = MovY->getOperand(0).getSubReg(); 500 501 if (!TRI.isVGPR(MRI, Y)) 502 continue; 503 504 MachineInstr *MovX = nullptr; 505 for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); 506 I != IY; ++I) { 507 if (instReadsReg(&*I, X, Xsub, TRI) || 508 instModifiesReg(&*I, Y, Ysub, TRI) || 509 instModifiesReg(&*I, T, Tsub, TRI) || 510 (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { 511 MovX = nullptr; 512 break; 513 } 514 if (!instReadsReg(&*I, Y, Ysub, TRI)) { 515 if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { 516 MovX = nullptr; 517 break; 518 } 519 continue; 520 } 521 if (MovX || 522 (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && 523 I->getOpcode() != AMDGPU::COPY) || 524 I->getOperand(0).getReg() != X || 525 I->getOperand(0).getSubReg() != Xsub) { 526 MovX = nullptr; 527 break; 528 } 529 MovX = &*I; 530 } 531 532 if (!MovX) 533 continue; 534 535 LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY); 536 537 for (unsigned I = 0; I < Size; ++I) { 538 TargetInstrInfo::RegSubRegPair X1, Y1; 539 X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); 540 Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); 541 BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(), 542 TII->get(AMDGPU::V_SWAP_B32)) 543 .addDef(X1.Reg, 0, X1.SubReg) 544 .addDef(Y1.Reg, 0, Y1.SubReg) 545 .addReg(Y1.Reg, 0, Y1.SubReg) 546 .addReg(X1.Reg, 0, X1.SubReg).getInstr(); 547 } 548 MovX->eraseFromParent(); 549 MovY->eraseFromParent(); 550 MachineInstr *Next = &*std::next(MovT.getIterator()); 551 if (MRI.use_nodbg_empty(T)) 552 MovT.eraseFromParent(); 553 else 554 Xop.setIsKill(false); 555 556 return Next; 557 } 558 559 return nullptr; 560 } 561 562 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { 563 if (skipFunction(MF.getFunction())) 564 return false; 565 566 MachineRegisterInfo &MRI = MF.getRegInfo(); 567 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 568 const SIInstrInfo *TII = ST.getInstrInfo(); 569 unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; 570 571 std::vector<unsigned> I1Defs; 572 573 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 574 BI != BE; ++BI) { 575 576 MachineBasicBlock &MBB = *BI; 577 MachineBasicBlock::iterator I, Next; 578 for (I = MBB.begin(); I != MBB.end(); I = Next) { 579 Next = std::next(I); 580 MachineInstr &MI = *I; 581 582 if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { 583 // If this has a literal constant source that is the same as the 584 // reversed bits of an inline immediate, replace with a bitreverse of 585 // that constant. This saves 4 bytes in the common case of materializing 586 // sign bits. 587 588 // Test if we are after regalloc. We only want to do this after any 589 // optimizations happen because this will confuse them. 590 // XXX - not exactly a check for post-regalloc run. 591 MachineOperand &Src = MI.getOperand(1); 592 if (Src.isImm() && 593 Register::isPhysicalRegister(MI.getOperand(0).getReg())) { 594 int32_t ReverseImm; 595 if (isReverseInlineImm(TII, Src, ReverseImm)) { 596 MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); 597 Src.setImm(ReverseImm); 598 continue; 599 } 600 } 601 } 602 603 if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || 604 MI.getOpcode() == AMDGPU::COPY)) { 605 if (auto *NextMI = matchSwap(MI, MRI, TII)) { 606 Next = NextMI->getIterator(); 607 continue; 608 } 609 } 610 611 // Combine adjacent s_nops to use the immediate operand encoding how long 612 // to wait. 613 // 614 // s_nop N 615 // s_nop M 616 // => 617 // s_nop (N + M) 618 if (MI.getOpcode() == AMDGPU::S_NOP && 619 MI.getNumOperands() == 1 && // Don't merge with implicit operands 620 Next != MBB.end() && 621 (*Next).getOpcode() == AMDGPU::S_NOP && 622 (*Next).getNumOperands() == 1) { 623 624 MachineInstr &NextMI = *Next; 625 // The instruction encodes the amount to wait with an offset of 1, 626 // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back 627 // after adding. 628 uint8_t Nop0 = MI.getOperand(0).getImm() + 1; 629 uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1; 630 631 // Make sure we don't overflow the bounds. 632 if (Nop0 + Nop1 <= 8) { 633 NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1); 634 MI.eraseFromParent(); 635 } 636 637 continue; 638 } 639 640 // FIXME: We also need to consider movs of constant operands since 641 // immediate operands are not folded if they have more than one use, and 642 // the operand folding pass is unaware if the immediate will be free since 643 // it won't know if the src == dest constraint will end up being 644 // satisfied. 645 if (MI.getOpcode() == AMDGPU::S_ADD_I32 || 646 MI.getOpcode() == AMDGPU::S_MUL_I32) { 647 const MachineOperand *Dest = &MI.getOperand(0); 648 MachineOperand *Src0 = &MI.getOperand(1); 649 MachineOperand *Src1 = &MI.getOperand(2); 650 651 if (!Src0->isReg() && Src1->isReg()) { 652 if (TII->commuteInstruction(MI, false, 1, 2)) 653 std::swap(Src0, Src1); 654 } 655 656 // FIXME: This could work better if hints worked with subregisters. If 657 // we have a vector add of a constant, we usually don't get the correct 658 // allocation due to the subregister usage. 659 if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) { 660 MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); 661 MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); 662 continue; 663 } 664 665 if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { 666 if (Src1->isImm() && isKImmOperand(TII, *Src1)) { 667 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? 668 AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; 669 670 MI.setDesc(TII->get(Opc)); 671 MI.tieOperands(0, 1); 672 } 673 } 674 } 675 676 // Try to use s_cmpk_* 677 if (MI.isCompare() && TII->isSOPC(MI)) { 678 shrinkScalarCompare(TII, MI); 679 continue; 680 } 681 682 // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. 683 if (MI.getOpcode() == AMDGPU::S_MOV_B32) { 684 const MachineOperand &Dst = MI.getOperand(0); 685 MachineOperand &Src = MI.getOperand(1); 686 687 if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) { 688 int32_t ReverseImm; 689 if (isKImmOperand(TII, Src)) 690 MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); 691 else if (isReverseInlineImm(TII, Src, ReverseImm)) { 692 MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); 693 Src.setImm(ReverseImm); 694 } 695 } 696 697 continue; 698 } 699 700 // Shrink scalar logic operations. 701 if (MI.getOpcode() == AMDGPU::S_AND_B32 || 702 MI.getOpcode() == AMDGPU::S_OR_B32 || 703 MI.getOpcode() == AMDGPU::S_XOR_B32) { 704 if (shrinkScalarLogicOp(ST, MRI, TII, MI)) 705 continue; 706 } 707 708 if (TII->isMIMG(MI.getOpcode()) && 709 ST.getGeneration() >= AMDGPUSubtarget::GFX10 && 710 MF.getProperties().hasProperty( 711 MachineFunctionProperties::Property::NoVRegs)) { 712 shrinkMIMG(MI); 713 continue; 714 } 715 716 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 717 continue; 718 719 if (!TII->canShrink(MI, MRI)) { 720 // Try commuting the instruction and see if that enables us to shrink 721 // it. 722 if (!MI.isCommutable() || !TII->commuteInstruction(MI) || 723 !TII->canShrink(MI, MRI)) 724 continue; 725 } 726 727 // getVOPe32 could be -1 here if we started with an instruction that had 728 // a 32-bit encoding and then commuted it to an instruction that did not. 729 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 730 continue; 731 732 int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); 733 734 if (TII->isVOPC(Op32)) { 735 Register DstReg = MI.getOperand(0).getReg(); 736 if (Register::isVirtualRegister(DstReg)) { 737 // VOPC instructions can only write to the VCC register. We can't 738 // force them to use VCC here, because this is only one register and 739 // cannot deal with sequences which would require multiple copies of 740 // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) 741 // 742 // So, instead of forcing the instruction to write to VCC, we provide 743 // a hint to the register allocator to use VCC and then we will run 744 // this pass again after RA and shrink it if it outputs to VCC. 745 MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); 746 continue; 747 } 748 if (DstReg != VCCReg) 749 continue; 750 } 751 752 if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { 753 // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC 754 // instructions. 755 const MachineOperand *Src2 = 756 TII->getNamedOperand(MI, AMDGPU::OpName::src2); 757 if (!Src2->isReg()) 758 continue; 759 Register SReg = Src2->getReg(); 760 if (Register::isVirtualRegister(SReg)) { 761 MRI.setRegAllocationHint(SReg, 0, VCCReg); 762 continue; 763 } 764 if (SReg != VCCReg) 765 continue; 766 } 767 768 // Check for the bool flag output for instructions like V_ADD_I32_e64. 769 const MachineOperand *SDst = TII->getNamedOperand(MI, 770 AMDGPU::OpName::sdst); 771 772 // Check the carry-in operand for v_addc_u32_e64. 773 const MachineOperand *Src2 = TII->getNamedOperand(MI, 774 AMDGPU::OpName::src2); 775 776 if (SDst) { 777 bool Next = false; 778 779 if (SDst->getReg() != VCCReg) { 780 if (Register::isVirtualRegister(SDst->getReg())) 781 MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); 782 Next = true; 783 } 784 785 // All of the instructions with carry outs also have an SGPR input in 786 // src2. 787 if (Src2 && Src2->getReg() != VCCReg) { 788 if (Register::isVirtualRegister(Src2->getReg())) 789 MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); 790 Next = true; 791 } 792 793 if (Next) 794 continue; 795 } 796 797 // We can shrink this instruction 798 LLVM_DEBUG(dbgs() << "Shrinking " << MI); 799 800 MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32); 801 ++NumInstructionsShrunk; 802 803 // Copy extra operands not present in the instruction definition. 804 copyExtraImplicitOps(*Inst32, MF, MI); 805 806 MI.eraseFromParent(); 807 foldImmediates(*Inst32, TII, MRI); 808 809 LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); 810 } 811 } 812 return false; 813 } 814