//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool isSafeToFold(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_f32_e64 if we are trying to fold into src2.
    unsigned Opc = MI->getOpcode();
    if (Opc == AMDGPU::V_MAC_F32_e64 &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      // Check if changing this to a v_mad_f32 instruction will allow us to
      // fold the operand.
      MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call to commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
      return false;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef();
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold full
    // copies, since a subregister use tied to a full register def doesn't
    // really make sense. e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
      TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI.use_begin(RegSeqDstReg),
           RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes. Target independent opcodes
  // don't have defined register classes.
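  // (An OpInfo RegClass of -1 means the operand has no register class
  // assigned, so we have no way to check that the fold would be legal.)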
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.
  return;
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  int32_t LHS, int32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
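// For example, a not of an immediate becomes a mov of the inverted value, and
// or/and/xor with 0 or -1 collapse into either a copy or a mov of a known
// constant (see the identities handled below).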
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MachineOperand &Src0 = MI->getOperand(1);
    if (Src0.isImm()) {
      Src0.setImm(~Src0.getImm());
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
      return true;
    }

    return false;
  }

  if (!MI->isCommutable())
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    Src0->setImm(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    }
  }

  return false;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
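      // For the mov/copy opcodes accepted by isSafeToFold, operand 0 is the
      // def and operand 1 is the source we will try to propagate into uses.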
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY below must not be replaced by 1:
      //
      // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      // ...
      // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
             Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (Fold.isReg()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');

          // Folding the immediate may reveal operations that can be constant
          // folded or replaced with a copy. This can happen for example after
          // frame indices are lowered to constants or from splitting 64-bit
          // constants.
          tryConstantFoldOp(MRI, TII, Fold.UseMI);
        }
      }
    }
  }
  return false;
}