1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// The pass tries to use the 32-bit encoding for instructions when possible. 8 //===----------------------------------------------------------------------===// 9 // 10 11 #include "AMDGPU.h" 12 #include "AMDGPUSubtarget.h" 13 #include "SIInstrInfo.h" 14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 15 #include "llvm/ADT/Statistic.h" 16 #include "llvm/CodeGen/MachineFunctionPass.h" 17 #include "llvm/CodeGen/MachineInstrBuilder.h" 18 #include "llvm/CodeGen/MachineRegisterInfo.h" 19 #include "llvm/IR/Constants.h" 20 #include "llvm/IR/Function.h" 21 #include "llvm/IR/LLVMContext.h" 22 #include "llvm/Support/Debug.h" 23 #include "llvm/Support/raw_ostream.h" 24 #include "llvm/Target/TargetMachine.h" 25 26 #define DEBUG_TYPE "si-shrink-instructions" 27 28 STATISTIC(NumInstructionsShrunk, 29 "Number of 64-bit instruction reduced to 32-bit."); 30 STATISTIC(NumLiteralConstantsFolded, 31 "Number of literal constants folded into 32-bit instructions."); 32 33 using namespace llvm; 34 35 namespace { 36 37 class SIShrinkInstructions : public MachineFunctionPass { 38 public: 39 static char ID; 40 41 void shrinkMIMG(MachineInstr &MI); 42 43 public: 44 SIShrinkInstructions() : MachineFunctionPass(ID) { 45 } 46 47 bool runOnMachineFunction(MachineFunction &MF) override; 48 49 StringRef getPassName() const override { return "SI Shrink Instructions"; } 50 51 void getAnalysisUsage(AnalysisUsage &AU) const override { 52 AU.setPreservesCFG(); 53 MachineFunctionPass::getAnalysisUsage(AU); 54 } 55 }; 56 57 } // End anonymous namespace. 58 59 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE, 60 "SI Shrink Instructions", false, false) 61 62 char SIShrinkInstructions::ID = 0; 63 64 FunctionPass *llvm::createSIShrinkInstructionsPass() { 65 return new SIShrinkInstructions(); 66 } 67 68 /// This function checks \p MI for operands defined by a move immediate 69 /// instruction and then folds the literal constant into the instruction if it 70 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 71 static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, 72 MachineRegisterInfo &MRI, bool TryToCommute = true) { 73 assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); 74 75 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 76 77 // Try to fold Src0 78 MachineOperand &Src0 = MI.getOperand(Src0Idx); 79 if (Src0.isReg()) { 80 Register Reg = Src0.getReg(); 81 if (Reg.isVirtual() && MRI.hasOneUse(Reg)) { 82 MachineInstr *Def = MRI.getUniqueVRegDef(Reg); 83 if (Def && Def->isMoveImmediate()) { 84 MachineOperand &MovSrc = Def->getOperand(1); 85 bool ConstantFolded = false; 86 87 if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || 88 isUInt<32>(MovSrc.getImm()))) { 89 Src0.ChangeToImmediate(MovSrc.getImm()); 90 ConstantFolded = true; 91 } else if (MovSrc.isFI()) { 92 Src0.ChangeToFrameIndex(MovSrc.getIndex()); 93 ConstantFolded = true; 94 } else if (MovSrc.isGlobal()) { 95 Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), 96 MovSrc.getTargetFlags()); 97 ConstantFolded = true; 98 } 99 100 if (ConstantFolded) { 101 assert(MRI.use_empty(Reg)); 102 Def->eraseFromParent(); 103 ++NumLiteralConstantsFolded; 104 return true; 105 } 106 } 107 } 108 } 109 110 // We have failed to fold src0, so commute the instruction and try again. 111 if (TryToCommute && MI.isCommutable()) { 112 if (TII->commuteInstruction(MI)) { 113 if (foldImmediates(MI, TII, MRI, false)) 114 return true; 115 116 // Commute back. 117 TII->commuteInstruction(MI); 118 } 119 } 120 121 return false; 122 } 123 124 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 125 return isInt<16>(Src.getImm()) && 126 !TII->isInlineConstant(*Src.getParent(), 127 Src.getParent()->getOperandNo(&Src)); 128 } 129 130 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 131 return isUInt<16>(Src.getImm()) && 132 !TII->isInlineConstant(*Src.getParent(), 133 Src.getParent()->getOperandNo(&Src)); 134 } 135 136 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, 137 const MachineOperand &Src, 138 bool &IsUnsigned) { 139 if (isInt<16>(Src.getImm())) { 140 IsUnsigned = false; 141 return !TII->isInlineConstant(Src); 142 } 143 144 if (isUInt<16>(Src.getImm())) { 145 IsUnsigned = true; 146 return !TII->isInlineConstant(Src); 147 } 148 149 return false; 150 } 151 152 /// \returns true if the constant in \p Src should be replaced with a bitreverse 153 /// of an inline immediate. 154 static bool isReverseInlineImm(const SIInstrInfo *TII, 155 const MachineOperand &Src, 156 int32_t &ReverseImm) { 157 if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) 158 return false; 159 160 ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); 161 return ReverseImm >= -16 && ReverseImm <= 64; 162 } 163 164 /// Copy implicit register operands from specified instruction to this 165 /// instruction that are not part of the instruction definition. 166 static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, 167 const MachineInstr &MI) { 168 for (unsigned i = MI.getDesc().getNumOperands() + 169 MI.getDesc().getNumImplicitUses() + 170 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 171 i != e; ++i) { 172 const MachineOperand &MO = MI.getOperand(i); 173 if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) 174 NewMI.addOperand(MF, MO); 175 } 176 } 177 178 static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { 179 // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to 180 // get constants on the RHS. 181 if (!MI.getOperand(0).isReg()) 182 TII->commuteInstruction(MI, false, 0, 1); 183 184 // cmpk requires src0 to be a register 185 const MachineOperand &Src0 = MI.getOperand(0); 186 if (!Src0.isReg()) 187 return; 188 189 const MachineOperand &Src1 = MI.getOperand(1); 190 if (!Src1.isImm()) 191 return; 192 193 int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); 194 if (SOPKOpc == -1) 195 return; 196 197 // eq/ne is special because the imm16 can be treated as signed or unsigned, 198 // and initially selectd to the unsigned versions. 199 if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { 200 bool HasUImm; 201 if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { 202 if (!HasUImm) { 203 SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? 204 AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; 205 } 206 207 MI.setDesc(TII->get(SOPKOpc)); 208 } 209 210 return; 211 } 212 213 const MCInstrDesc &NewDesc = TII->get(SOPKOpc); 214 215 if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || 216 (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { 217 MI.setDesc(NewDesc); 218 } 219 } 220 221 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 222 void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { 223 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); 224 if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) 225 return; 226 227 MachineFunction *MF = MI.getParent()->getParent(); 228 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 229 const SIInstrInfo *TII = ST.getInstrInfo(); 230 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 231 int VAddr0Idx = 232 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); 233 unsigned NewAddrDwords = Info->VAddrDwords; 234 const TargetRegisterClass *RC; 235 236 if (Info->VAddrDwords == 2) { 237 RC = &AMDGPU::VReg_64RegClass; 238 } else if (Info->VAddrDwords == 3) { 239 RC = &AMDGPU::VReg_96RegClass; 240 } else if (Info->VAddrDwords == 4) { 241 RC = &AMDGPU::VReg_128RegClass; 242 } else if (Info->VAddrDwords <= 8) { 243 RC = &AMDGPU::VReg_256RegClass; 244 NewAddrDwords = 8; 245 } else { 246 RC = &AMDGPU::VReg_512RegClass; 247 NewAddrDwords = 16; 248 } 249 250 unsigned VgprBase = 0; 251 bool IsUndef = true; 252 bool IsKill = NewAddrDwords == Info->VAddrDwords; 253 for (unsigned i = 0; i < Info->VAddrDwords; ++i) { 254 const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); 255 unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); 256 257 if (i == 0) { 258 VgprBase = Vgpr; 259 } else if (VgprBase + i != Vgpr) 260 return; 261 262 if (!Op.isUndef()) 263 IsUndef = false; 264 if (!Op.isKill()) 265 IsKill = false; 266 } 267 268 if (VgprBase + NewAddrDwords > 256) 269 return; 270 271 // Further check for implicit tied operands - this may be present if TFE is 272 // enabled 273 int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); 274 int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe); 275 unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm(); 276 unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm(); 277 int ToUntie = -1; 278 if (TFEVal || LWEVal) { 279 // TFE/LWE is enabled so we need to deal with an implicit tied operand 280 for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) { 281 if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() && 282 MI.getOperand(i).isImplicit()) { 283 // This is the tied operand 284 assert( 285 ToUntie == -1 && 286 "found more than one tied implicit operand when expecting only 1"); 287 ToUntie = i; 288 MI.untieRegOperand(ToUntie); 289 } 290 } 291 } 292 293 unsigned NewOpcode = 294 AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, 295 Info->VDataDwords, NewAddrDwords); 296 MI.setDesc(TII->get(NewOpcode)); 297 MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); 298 MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); 299 MI.getOperand(VAddr0Idx).setIsKill(IsKill); 300 301 for (unsigned i = 1; i < Info->VAddrDwords; ++i) 302 MI.RemoveOperand(VAddr0Idx + 1); 303 304 if (ToUntie >= 0) { 305 MI.tieOperands( 306 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), 307 ToUntie - (Info->VAddrDwords - 1)); 308 } 309 } 310 311 /// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals. 312 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. 313 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or 314 /// XNOR (as a ^ b == ~(a ^ ~b)). 315 /// \returns true if the caller should continue the machine function iterator 316 static bool shrinkScalarLogicOp(const GCNSubtarget &ST, 317 MachineRegisterInfo &MRI, 318 const SIInstrInfo *TII, 319 MachineInstr &MI) { 320 unsigned Opc = MI.getOpcode(); 321 const MachineOperand *Dest = &MI.getOperand(0); 322 MachineOperand *Src0 = &MI.getOperand(1); 323 MachineOperand *Src1 = &MI.getOperand(2); 324 MachineOperand *SrcReg = Src0; 325 MachineOperand *SrcImm = Src1; 326 327 if (!SrcImm->isImm() || 328 AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) 329 return false; 330 331 uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); 332 uint32_t NewImm = 0; 333 334 if (Opc == AMDGPU::S_AND_B32) { 335 if (isPowerOf2_32(~Imm)) { 336 NewImm = countTrailingOnes(Imm); 337 Opc = AMDGPU::S_BITSET0_B32; 338 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 339 NewImm = ~Imm; 340 Opc = AMDGPU::S_ANDN2_B32; 341 } 342 } else if (Opc == AMDGPU::S_OR_B32) { 343 if (isPowerOf2_32(Imm)) { 344 NewImm = countTrailingZeros(Imm); 345 Opc = AMDGPU::S_BITSET1_B32; 346 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 347 NewImm = ~Imm; 348 Opc = AMDGPU::S_ORN2_B32; 349 } 350 } else if (Opc == AMDGPU::S_XOR_B32) { 351 if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 352 NewImm = ~Imm; 353 Opc = AMDGPU::S_XNOR_B32; 354 } 355 } else { 356 llvm_unreachable("unexpected opcode"); 357 } 358 359 if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && 360 SrcImm == Src0) { 361 if (!TII->commuteInstruction(MI, false, 1, 2)) 362 NewImm = 0; 363 } 364 365 if (NewImm != 0) { 366 if (Dest->getReg().isVirtual() && SrcReg->isReg()) { 367 MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); 368 MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); 369 return true; 370 } 371 372 if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { 373 const bool IsUndef = SrcReg->isUndef(); 374 const bool IsKill = SrcReg->isKill(); 375 MI.setDesc(TII->get(Opc)); 376 if (Opc == AMDGPU::S_BITSET0_B32 || 377 Opc == AMDGPU::S_BITSET1_B32) { 378 Src0->ChangeToImmediate(NewImm); 379 // Remove the immediate and add the tied input. 380 MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false, 381 /*isImp*/ false, IsKill, 382 /*isDead*/ false, IsUndef); 383 MI.tieOperands(0, 2); 384 } else { 385 SrcImm->setImm(NewImm); 386 } 387 } 388 } 389 390 return false; 391 } 392 393 // This is the same as MachineInstr::readsRegister/modifiesRegister except 394 // it takes subregs into account. 395 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, 396 Register Reg, unsigned SubReg, 397 const SIRegisterInfo &TRI) { 398 for (const MachineOperand &MO : R) { 399 if (!MO.isReg()) 400 continue; 401 402 if (Reg.isPhysical() && MO.getReg().isPhysical()) { 403 if (TRI.regsOverlap(Reg, MO.getReg())) 404 return true; 405 } else if (MO.getReg() == Reg && Reg.isVirtual()) { 406 LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & 407 TRI.getSubRegIndexLaneMask(MO.getSubReg()); 408 if (Overlap.any()) 409 return true; 410 } 411 } 412 return false; 413 } 414 415 static bool instReadsReg(const MachineInstr *MI, 416 unsigned Reg, unsigned SubReg, 417 const SIRegisterInfo &TRI) { 418 return instAccessReg(MI->uses(), Reg, SubReg, TRI); 419 } 420 421 static bool instModifiesReg(const MachineInstr *MI, 422 unsigned Reg, unsigned SubReg, 423 const SIRegisterInfo &TRI) { 424 return instAccessReg(MI->defs(), Reg, SubReg, TRI); 425 } 426 427 static TargetInstrInfo::RegSubRegPair 428 getSubRegForIndex(Register Reg, unsigned Sub, unsigned I, 429 const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { 430 if (TRI.getRegSizeInBits(Reg, MRI) != 32) { 431 if (Reg.isPhysical()) { 432 Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); 433 } else { 434 Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)); 435 } 436 } 437 return TargetInstrInfo::RegSubRegPair(Reg, Sub); 438 } 439 440 static void dropInstructionKeepingImpDefs(MachineInstr &MI, 441 const SIInstrInfo *TII) { 442 for (unsigned i = MI.getDesc().getNumOperands() + 443 MI.getDesc().getNumImplicitUses() + 444 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 445 i != e; ++i) { 446 const MachineOperand &Op = MI.getOperand(i); 447 if (!Op.isDef()) 448 continue; 449 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 450 TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg()); 451 } 452 453 MI.eraseFromParent(); 454 } 455 456 // Match: 457 // mov t, x 458 // mov x, y 459 // mov y, t 460 // 461 // => 462 // 463 // mov t, x (t is potentially dead and move eliminated) 464 // v_swap_b32 x, y 465 // 466 // Returns next valid instruction pointer if was able to create v_swap_b32. 467 // 468 // This shall not be done too early not to prevent possible folding which may 469 // remove matched moves, and this should prefereably be done before RA to 470 // release saved registers and also possibly after RA which can insert copies 471 // too. 472 // 473 // This is really just a generic peephole that is not a canocical shrinking, 474 // although requirements match the pass placement and it reduces code size too. 475 static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, 476 const SIInstrInfo *TII) { 477 assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || 478 MovT.getOpcode() == AMDGPU::COPY); 479 480 Register T = MovT.getOperand(0).getReg(); 481 unsigned Tsub = MovT.getOperand(0).getSubReg(); 482 MachineOperand &Xop = MovT.getOperand(1); 483 484 if (!Xop.isReg()) 485 return nullptr; 486 Register X = Xop.getReg(); 487 unsigned Xsub = Xop.getSubReg(); 488 489 unsigned Size = TII->getOpSize(MovT, 0) / 4; 490 491 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 492 if (!TRI.isVGPR(MRI, X)) 493 return nullptr; 494 495 if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0)) 496 return nullptr; 497 498 const unsigned SearchLimit = 16; 499 unsigned Count = 0; 500 bool KilledT = false; 501 for (auto Iter = std::next(MovT.getIterator()), 502 E = MovT.getParent()->instr_end(); 503 Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { 504 505 MachineInstr *MovY = &*Iter; 506 KilledT = MovY->killsRegister(T, &TRI); 507 508 if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && 509 MovY->getOpcode() != AMDGPU::COPY) || 510 !MovY->getOperand(1).isReg() || 511 MovY->getOperand(1).getReg() != T || 512 MovY->getOperand(1).getSubReg() != Tsub || 513 MovY->hasRegisterImplicitUseOperand(AMDGPU::M0)) 514 continue; 515 516 Register Y = MovY->getOperand(0).getReg(); 517 unsigned Ysub = MovY->getOperand(0).getSubReg(); 518 519 if (!TRI.isVGPR(MRI, Y)) 520 continue; 521 522 MachineInstr *MovX = nullptr; 523 for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); 524 I != IY; ++I) { 525 if (instReadsReg(&*I, X, Xsub, TRI) || 526 instModifiesReg(&*I, Y, Ysub, TRI) || 527 instModifiesReg(&*I, T, Tsub, TRI) || 528 (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { 529 MovX = nullptr; 530 break; 531 } 532 if (!instReadsReg(&*I, Y, Ysub, TRI)) { 533 if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { 534 MovX = nullptr; 535 break; 536 } 537 continue; 538 } 539 if (MovX || 540 (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && 541 I->getOpcode() != AMDGPU::COPY) || 542 I->getOperand(0).getReg() != X || 543 I->getOperand(0).getSubReg() != Xsub) { 544 MovX = nullptr; 545 break; 546 } 547 // Implicit use of M0 is an indirect move. 548 if (I->hasRegisterImplicitUseOperand(AMDGPU::M0)) 549 continue; 550 551 if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U))) 552 continue; 553 554 MovX = &*I; 555 } 556 557 if (!MovX) 558 continue; 559 560 LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY); 561 562 for (unsigned I = 0; I < Size; ++I) { 563 TargetInstrInfo::RegSubRegPair X1, Y1; 564 X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); 565 Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); 566 MachineBasicBlock &MBB = *MovT.getParent(); 567 auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), 568 TII->get(AMDGPU::V_SWAP_B32)) 569 .addDef(X1.Reg, 0, X1.SubReg) 570 .addDef(Y1.Reg, 0, Y1.SubReg) 571 .addReg(Y1.Reg, 0, Y1.SubReg) 572 .addReg(X1.Reg, 0, X1.SubReg).getInstr(); 573 if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 574 // Drop implicit EXEC. 575 MIB->RemoveOperand(MIB->getNumExplicitOperands()); 576 MIB->copyImplicitOps(*MBB.getParent(), *MovX); 577 } 578 } 579 MovX->eraseFromParent(); 580 dropInstructionKeepingImpDefs(*MovY, TII); 581 MachineInstr *Next = &*std::next(MovT.getIterator()); 582 583 if (MRI.use_nodbg_empty(T)) { 584 dropInstructionKeepingImpDefs(MovT, TII); 585 } else { 586 Xop.setIsKill(false); 587 for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) { 588 unsigned OpNo = MovT.getNumExplicitOperands() + I; 589 const MachineOperand &Op = MovT.getOperand(OpNo); 590 if (Op.isKill() && TRI.regsOverlap(X, Op.getReg())) 591 MovT.RemoveOperand(OpNo); 592 } 593 } 594 595 return Next; 596 } 597 598 return nullptr; 599 } 600 601 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { 602 if (skipFunction(MF.getFunction())) 603 return false; 604 605 MachineRegisterInfo &MRI = MF.getRegInfo(); 606 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 607 const SIInstrInfo *TII = ST.getInstrInfo(); 608 unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; 609 610 std::vector<unsigned> I1Defs; 611 612 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 613 BI != BE; ++BI) { 614 615 MachineBasicBlock &MBB = *BI; 616 MachineBasicBlock::iterator I, Next; 617 for (I = MBB.begin(); I != MBB.end(); I = Next) { 618 Next = std::next(I); 619 MachineInstr &MI = *I; 620 621 if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { 622 // If this has a literal constant source that is the same as the 623 // reversed bits of an inline immediate, replace with a bitreverse of 624 // that constant. This saves 4 bytes in the common case of materializing 625 // sign bits. 626 627 // Test if we are after regalloc. We only want to do this after any 628 // optimizations happen because this will confuse them. 629 // XXX - not exactly a check for post-regalloc run. 630 MachineOperand &Src = MI.getOperand(1); 631 if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { 632 int32_t ReverseImm; 633 if (isReverseInlineImm(TII, Src, ReverseImm)) { 634 MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); 635 Src.setImm(ReverseImm); 636 continue; 637 } 638 } 639 } 640 641 if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || 642 MI.getOpcode() == AMDGPU::COPY)) { 643 if (auto *NextMI = matchSwap(MI, MRI, TII)) { 644 Next = NextMI->getIterator(); 645 continue; 646 } 647 } 648 649 // FIXME: We also need to consider movs of constant operands since 650 // immediate operands are not folded if they have more than one use, and 651 // the operand folding pass is unaware if the immediate will be free since 652 // it won't know if the src == dest constraint will end up being 653 // satisfied. 654 if (MI.getOpcode() == AMDGPU::S_ADD_I32 || 655 MI.getOpcode() == AMDGPU::S_MUL_I32) { 656 const MachineOperand *Dest = &MI.getOperand(0); 657 MachineOperand *Src0 = &MI.getOperand(1); 658 MachineOperand *Src1 = &MI.getOperand(2); 659 660 if (!Src0->isReg() && Src1->isReg()) { 661 if (TII->commuteInstruction(MI, false, 1, 2)) 662 std::swap(Src0, Src1); 663 } 664 665 // FIXME: This could work better if hints worked with subregisters. If 666 // we have a vector add of a constant, we usually don't get the correct 667 // allocation due to the subregister usage. 668 if (Dest->getReg().isVirtual() && Src0->isReg()) { 669 MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); 670 MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); 671 continue; 672 } 673 674 if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { 675 if (Src1->isImm() && isKImmOperand(TII, *Src1)) { 676 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? 677 AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; 678 679 MI.setDesc(TII->get(Opc)); 680 MI.tieOperands(0, 1); 681 } 682 } 683 } 684 685 // Try to use s_cmpk_* 686 if (MI.isCompare() && TII->isSOPC(MI)) { 687 shrinkScalarCompare(TII, MI); 688 continue; 689 } 690 691 // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. 692 if (MI.getOpcode() == AMDGPU::S_MOV_B32) { 693 const MachineOperand &Dst = MI.getOperand(0); 694 MachineOperand &Src = MI.getOperand(1); 695 696 if (Src.isImm() && Dst.getReg().isPhysical()) { 697 int32_t ReverseImm; 698 if (isKImmOperand(TII, Src)) 699 MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); 700 else if (isReverseInlineImm(TII, Src, ReverseImm)) { 701 MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); 702 Src.setImm(ReverseImm); 703 } 704 } 705 706 continue; 707 } 708 709 // Shrink scalar logic operations. 710 if (MI.getOpcode() == AMDGPU::S_AND_B32 || 711 MI.getOpcode() == AMDGPU::S_OR_B32 || 712 MI.getOpcode() == AMDGPU::S_XOR_B32) { 713 if (shrinkScalarLogicOp(ST, MRI, TII, MI)) 714 continue; 715 } 716 717 if (TII->isMIMG(MI.getOpcode()) && 718 ST.getGeneration() >= AMDGPUSubtarget::GFX10 && 719 MF.getProperties().hasProperty( 720 MachineFunctionProperties::Property::NoVRegs)) { 721 shrinkMIMG(MI); 722 continue; 723 } 724 725 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 726 continue; 727 728 if (!TII->canShrink(MI, MRI)) { 729 // Try commuting the instruction and see if that enables us to shrink 730 // it. 731 if (!MI.isCommutable() || !TII->commuteInstruction(MI) || 732 !TII->canShrink(MI, MRI)) 733 continue; 734 } 735 736 // getVOPe32 could be -1 here if we started with an instruction that had 737 // a 32-bit encoding and then commuted it to an instruction that did not. 738 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 739 continue; 740 741 int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); 742 743 if (TII->isVOPC(Op32)) { 744 Register DstReg = MI.getOperand(0).getReg(); 745 if (DstReg.isVirtual()) { 746 // VOPC instructions can only write to the VCC register. We can't 747 // force them to use VCC here, because this is only one register and 748 // cannot deal with sequences which would require multiple copies of 749 // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) 750 // 751 // So, instead of forcing the instruction to write to VCC, we provide 752 // a hint to the register allocator to use VCC and then we will run 753 // this pass again after RA and shrink it if it outputs to VCC. 754 MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); 755 continue; 756 } 757 if (DstReg != VCCReg) 758 continue; 759 } 760 761 if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { 762 // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC 763 // instructions. 764 const MachineOperand *Src2 = 765 TII->getNamedOperand(MI, AMDGPU::OpName::src2); 766 if (!Src2->isReg()) 767 continue; 768 Register SReg = Src2->getReg(); 769 if (SReg.isVirtual()) { 770 MRI.setRegAllocationHint(SReg, 0, VCCReg); 771 continue; 772 } 773 if (SReg != VCCReg) 774 continue; 775 } 776 777 // Check for the bool flag output for instructions like V_ADD_I32_e64. 778 const MachineOperand *SDst = TII->getNamedOperand(MI, 779 AMDGPU::OpName::sdst); 780 781 // Check the carry-in operand for v_addc_u32_e64. 782 const MachineOperand *Src2 = TII->getNamedOperand(MI, 783 AMDGPU::OpName::src2); 784 785 if (SDst) { 786 bool Next = false; 787 788 if (SDst->getReg() != VCCReg) { 789 if (SDst->getReg().isVirtual()) 790 MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); 791 Next = true; 792 } 793 794 // All of the instructions with carry outs also have an SGPR input in 795 // src2. 796 if (Src2 && Src2->getReg() != VCCReg) { 797 if (Src2->getReg().isVirtual()) 798 MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); 799 Next = true; 800 } 801 802 if (Next) 803 continue; 804 } 805 806 // We can shrink this instruction 807 LLVM_DEBUG(dbgs() << "Shrinking " << MI); 808 809 MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32); 810 ++NumInstructionsShrunk; 811 812 // Copy extra operands not present in the instruction definition. 813 copyExtraImplicitOps(*Inst32, MF, MI); 814 815 MI.eraseFromParent(); 816 foldImmediates(*Inst32, TII, MRI); 817 818 LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); 819 } 820 } 821 return false; 822 } 823