//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
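/// For example, a preceding V_MOV_B32 of a non-inline 32-bit literal, a frame
/// index, or a global address feeding src0 is folded directly into src0, and
/// the defining move is erased once it has no remaining non-debug uses.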
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
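/// (i.e. implicit register operands and regmasks appended beyond the operands
/// and implicit uses/defs declared by the MCInstrDesc).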
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
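// The non-NSA encoding takes the address as a single VGPR tuple, so this only
// applies when the NSA address operands already sit in consecutive VGPRs
// (checked against their hardware register indices below).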
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - these may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
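// The *AK forms take the literal as the addend (src2) while the *MK forms take
// it as a multiplicand, so the choice depends on which operand holds the
// non-inline immediate.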
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  if (!ST->hasVOP3Literal())
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
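/// For example, x & 0x7fffffff can become S_BITSET0_B32 x, 31 and
/// x | 0x80000000 can become S_BITSET1_B32 x, 31, since each selects a single
/// bit to clear or set and neither immediate is inlineable.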
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
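// Physical registers are compared with regsOverlap; virtual registers are
// compared by intersecting the lane masks of the subregister indices.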
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns next valid instruction pointer if it was able to create v_swap_b32.
//
// This should not be done too early, so as not to prevent folding that may
// remove the matched moves; it should preferably be done before RA to release
// the saved registers, and possibly again after RA, which can insert copies
// too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
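// The search for the matching moves is limited to the SearchLimit (16)
// instructions following MovT in the same block, and stops once T is killed.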
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx10+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (ST->getGeneration() < AMDGPUSubtarget::GFX10)
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
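        // While the destination is still virtual, only record mutual
        // allocation hints and retry after RA; the S_ADDK/S_MULK forms below
        // require src0 and the destination to be the same register.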
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace the dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
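        // The e32 form reads its condition implicitly from VCC, so src2 must
        // end up in VCC for the shrink to be possible.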
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}