//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
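///
/// A rough illustration (hypothetical MIR, not taken from this pass's tests):
///   %1:vgpr_32 = V_MOV_B32_e32 1234567
///   %2:vgpr_32 = V_ADD_F32_e32 %1, %0
/// can fold the literal straight into the user,
///   %2:vgpr_32 = V_ADD_F32_e32 1234567, %0
/// after which the V_MOV_B32_e32 is erased if it has no remaining non-debug
/// uses.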
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from the specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
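//
// A rough sketch of the idea (illustrative registers, not from this file's
// tests): an NSA image instruction whose separate address operands happen to
// land in the contiguous VGPRs v4, v5, v6, v7 can be re-encoded in the
// non-NSA form with a single vaddr tuple v[4:7], dropping the extra NSA
// address dwords from the encoding.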
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
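//
// Very roughly (illustrative operands, not taken from this file's tests):
//   v_mad_f32 v0, v1, v2, lit   ->  v_madak_f32 v0, v1, v2, lit
//   v_mad_f32 v0, v1, lit, v2   ->  v_madmk_f32 v0, v1, lit, v2
// where "lit" is a 32-bit literal that is not an inline constant: a literal in
// src2 selects the AK form, a literal in src0/src1 selects the MK form, and a
// commute is used when the literal or VGPR sits in the "wrong" source slot.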
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  if (!ST->hasVOP3Literal())
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
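///
/// A rough sketch (illustrative values, not from this file's tests), for the
/// post-RA case where the destination and first source are the same register:
///   s_and_b32 s0, s0, 0xffffdfff  ->  s_bitset0_b32 s0, 13
///   s_and_b32 s0, s0, 0xffffffc0  ->  s_andn2_b32 s0, s0, 63
/// replacing a 32-bit literal that costs an extra dword with a bit index or an
/// inline constant.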
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns next valid instruction pointer if it was able to create v_swap_b32.
//
// This must not be done too early, so as not to prevent folding that could
// remove the matched moves. It should preferably be done before RA to free
// saved registers, and possibly also after RA, which can insert copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI))
          continue;
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}