1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// The pass tries to use the 32-bit encoding for instructions when possible. 8 //===----------------------------------------------------------------------===// 9 // 10 11 #include "AMDGPU.h" 12 #include "GCNSubtarget.h" 13 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 14 #include "llvm/ADT/Statistic.h" 15 #include "llvm/CodeGen/MachineFunctionPass.h" 16 17 #define DEBUG_TYPE "si-shrink-instructions" 18 19 STATISTIC(NumInstructionsShrunk, 20 "Number of 64-bit instruction reduced to 32-bit."); 21 STATISTIC(NumLiteralConstantsFolded, 22 "Number of literal constants folded into 32-bit instructions."); 23 24 using namespace llvm; 25 26 namespace { 27 28 class SIShrinkInstructions : public MachineFunctionPass { 29 MachineRegisterInfo *MRI; 30 const GCNSubtarget *ST; 31 const SIInstrInfo *TII; 32 const SIRegisterInfo *TRI; 33 34 public: 35 static char ID; 36 37 public: 38 SIShrinkInstructions() : MachineFunctionPass(ID) { 39 } 40 41 bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const; 42 bool isKImmOperand(const MachineOperand &Src) const; 43 bool isKUImmOperand(const MachineOperand &Src) const; 44 bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; 45 bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const; 46 void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; 47 void shrinkScalarCompare(MachineInstr &MI) const; 48 void shrinkMIMG(MachineInstr &MI) const; 49 bool shrinkScalarLogicOp(MachineInstr &MI) const; 50 bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, 51 Register Reg, unsigned SubReg) const; 52 bool instReadsReg(const MachineInstr *MI, unsigned Reg, 53 
unsigned SubReg) const; 54 bool instModifiesReg(const MachineInstr *MI, unsigned Reg, 55 unsigned SubReg) const; 56 TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub, 57 unsigned I) const; 58 void dropInstructionKeepingImpDefs(MachineInstr &MI) const; 59 MachineInstr *matchSwap(MachineInstr &MovT) const; 60 61 bool runOnMachineFunction(MachineFunction &MF) override; 62 63 StringRef getPassName() const override { return "SI Shrink Instructions"; } 64 65 void getAnalysisUsage(AnalysisUsage &AU) const override { 66 AU.setPreservesCFG(); 67 MachineFunctionPass::getAnalysisUsage(AU); 68 } 69 }; 70 71 } // End anonymous namespace. 72 73 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE, 74 "SI Shrink Instructions", false, false) 75 76 char SIShrinkInstructions::ID = 0; 77 78 FunctionPass *llvm::createSIShrinkInstructionsPass() { 79 return new SIShrinkInstructions(); 80 } 81 82 /// This function checks \p MI for operands defined by a move immediate 83 /// instruction and then folds the literal constant into the instruction if it 84 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0: if it is the sole use of a virtual register defined by a
  // move-immediate, replace the register operand with the moved value and
  // delete the now-dead def.
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual() && MRI->hasOneUse(Reg)) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        // Only fold if the target considers the resulting operand legal in
        // this slot. Immediates, frame indexes and globals are all handled.
        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          // Src0 no longer references Reg, so the defining move is dead.
          assert(MRI->use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// \returns true if \p Src is a signed 16-bit immediate that is NOT already
/// representable as an inline constant (i.e. it would otherwise need a
/// 32-bit literal and is a candidate for a k-immediate encoding).
bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

/// Unsigned variant of isKImmOperand: true for a non-inline uint16 immediate.
bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

/// \returns true if \p Src fits a 16-bit immediate either signed or unsigned
/// and is not an inline constant. \p IsUnsigned reports which interpretation
/// applies (signed is preferred when both fit).
bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  // [-16, 64] is the inline-constant integer range on this target.
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
180 void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, 181 MachineInstr &MI) const { 182 MachineFunction &MF = *MI.getMF(); 183 for (unsigned i = MI.getDesc().getNumOperands() + 184 MI.getDesc().getNumImplicitUses() + 185 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 186 i != e; ++i) { 187 const MachineOperand &MO = MI.getOperand(i); 188 if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) 189 NewMI.addOperand(MF, MO); 190 } 191 } 192 193 void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { 194 // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to 195 // get constants on the RHS. 196 if (!MI.getOperand(0).isReg()) 197 TII->commuteInstruction(MI, false, 0, 1); 198 199 // cmpk requires src0 to be a register 200 const MachineOperand &Src0 = MI.getOperand(0); 201 if (!Src0.isReg()) 202 return; 203 204 const MachineOperand &Src1 = MI.getOperand(1); 205 if (!Src1.isImm()) 206 return; 207 208 int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); 209 if (SOPKOpc == -1) 210 return; 211 212 // eq/ne is special because the imm16 can be treated as signed or unsigned, 213 // and initially selected to the unsigned versions. 214 if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { 215 bool HasUImm; 216 if (isKImmOrKUImmOperand(Src1, HasUImm)) { 217 if (!HasUImm) { 218 SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? 219 AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; 220 } 221 222 MI.setDesc(TII->get(SOPKOpc)); 223 } 224 225 return; 226 } 227 228 const MCInstrDesc &NewDesc = TII->get(SOPKOpc); 229 230 if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) || 231 (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) { 232 MI.setDesc(NewDesc); 233 } 234 } 235 236 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  // Pick the contiguous VGPR tuple class matching the address size. Sizes
  // above 8 dwords round up to the 16-dword (512-bit) class.
  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  // The rewrite is only possible when the NSA address registers happen to be
  // physically contiguous. Bail out at the first gap.
  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    // The merged operand is undef/kill only if every component was.
    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  // The (possibly padded) tuple must not run off the end of the VGPR file.
  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand.
    // Untie it now; it is re-tied below after the operand list is compacted.
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  // Switch to the default (non-NSA) encoding and collapse the address
  // operands into a single register-tuple operand.
  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  // Drop the remaining per-dword address operands (indices shift as we erase,
  // so we always remove at VAddr0Idx + 1).
  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    // Re-tie vdata to the implicit operand, adjusting for removed operands.
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  // Nothing to do unless src1 is a literal that can't already be inlined.
  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      // AND with all-ones-except-one-bit == clear that bit.
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      // OR with a single bit == set that bit.
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  // NOTE(review): a NewImm of 0 is treated as "no rewrite found", which also
  // skips the (valid) BITSET rewrite for bit index 0 — a missed optimization,
  // not a correctness issue.
  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      // Before RA: only hint dest/src into the same register so the post-RA
      // run can perform the rewrite (BITSET/xN2 forms read their dst).
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      // Physical registers: overlap test covers aliasing register units.
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      // Virtual registers: compare lane masks of the two subregister indexes.
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

/// \returns true if \p MI reads (Reg, SubReg), subregister-aware.
bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

/// \returns true if \p MI writes (Reg, SubReg), subregister-aware.
bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

/// Resolve the 32-bit lane \p I of a (possibly wider) register.
/// Physical registers are narrowed to the actual sub-register; virtual
/// registers keep the register and get an adjusted subregister index.
TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

/// Erase \p MI, but first materialize IMPLICIT_DEFs for any implicit defs it
/// carried beyond its static operand list, so later liveness stays correct.
void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns next valid instruction pointer if was able to create v_swap_b32.
//
// This shall not be done too early not to prevent possible folding which may
// remove matched moves, and this should preferably be done before RA to
// release saved registers and also possibly after RA which can insert copies
// too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  // Number of 32-bit lanes to swap (wide COPYs become multiple v_swap_b32).
  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  // An implicit M0 use means this is an indirect move; don't touch it.
  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  // Scan a bounded window forward for the "mov y, t" that closes the cycle.
  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    // Once T is killed the pattern can no longer complete; stop after this
    // iteration.
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    // Between MovT and MovY, find the unique "mov x, y" (MovX), verifying no
    // intervening instruction reads/writes the registers in a way that would
    // make the swap unsound.
    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      // For multi-lane swaps, extra implicit operands on the middle move are
      // not handled; skip such candidates.
      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    // Emit one v_swap_b32 per 32-bit lane at MovX's position.
    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      // The temporary became dead; remove "mov t, x" while preserving any
      // implicit defs.
      dropInstructionKeepingImpDefs(MovT);
    } else {
      // X is still read by the swap, so MovT no longer kills it.
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

/// Main driver: walks every instruction, applying the scalar/vector shrinking
/// peepholes above and finally rewriting eligible 64-bit VALU encodings to
/// their 32-bit forms. Runs both before RA (where it mostly sets regalloc
/// hints) and after RA (where it performs the actual encoding changes).
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  // NOTE(review): I1Defs appears unused in this function; likely a leftover.
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      // Next is captured up front because peepholes below may erase MI.
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          // matchSwap may have erased instructions up to NextMI; resume there.
          Next = NextMI->getIterator();
          continue;
        }
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        // Post-RA: dst == src0 allows the tied ADDK/MULK form.
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      // NSA -> non-NSA MIMG rewrite is only valid once registers are physical.
      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI))
          continue;
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  // NOTE(review): returns false unconditionally even though the pass mutates
  // the function; presumably tolerated because CFG/analysis preservation is
  // declared — confirm against upstream before changing.
  return false;
}