//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

// Must be at least 4 to be able to branch over the minimum unconditional
// branch code. This is only for making it possible to write reasonably small
// tests for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the exec check.
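  // Note: rematerializing these moves should be safe even though they read
  // EXEC implicitly, since EXEC only masks which lanes are written; the value
  // produced for every active lane is the same wherever the move executes.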
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
189 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 190 return false; 191 192 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 193 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 194 return true; 195 } 196 197 return false; 198 } 199 200 static bool isStride64(unsigned Opc) { 201 switch (Opc) { 202 case AMDGPU::DS_READ2ST64_B32: 203 case AMDGPU::DS_READ2ST64_B64: 204 case AMDGPU::DS_WRITE2ST64_B32: 205 case AMDGPU::DS_WRITE2ST64_B64: 206 return true; 207 default: 208 return false; 209 } 210 } 211 212 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 213 int64_t &Offset, 214 const TargetRegisterInfo *TRI) const { 215 unsigned Opc = LdSt.getOpcode(); 216 217 if (isDS(LdSt)) { 218 const MachineOperand *OffsetImm = 219 getNamedOperand(LdSt, AMDGPU::OpName::offset); 220 if (OffsetImm) { 221 // Normal, single offset LDS instruction. 222 const MachineOperand *AddrReg = 223 getNamedOperand(LdSt, AMDGPU::OpName::addr); 224 225 BaseReg = AddrReg->getReg(); 226 Offset = OffsetImm->getImm(); 227 return true; 228 } 229 230 // The 2 offset instructions use offset0 and offset1 instead. We can treat 231 // these as a load with a single offset if the 2 offsets are consecutive. We 232 // will use this for some partially aligned loads. 233 const MachineOperand *Offset0Imm = 234 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 235 const MachineOperand *Offset1Imm = 236 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 237 238 uint8_t Offset0 = Offset0Imm->getImm(); 239 uint8_t Offset1 = Offset1Imm->getImm(); 240 241 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 242 // Each of these offsets is in element sized units, so we need to convert 243 // to bytes of the individual reads. 244 245 unsigned EltSize; 246 if (LdSt.mayLoad()) 247 EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; 248 else { 249 assert(LdSt.mayStore()); 250 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 251 EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); 252 } 253 254 if (isStride64(Opc)) 255 EltSize *= 64; 256 257 const MachineOperand *AddrReg = 258 getNamedOperand(LdSt, AMDGPU::OpName::addr); 259 BaseReg = AddrReg->getReg(); 260 Offset = EltSize * Offset0; 261 return true; 262 } 263 264 return false; 265 } 266 267 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 268 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 269 if (SOffset && SOffset->isReg()) 270 return false; 271 272 const MachineOperand *AddrReg = 273 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 274 if (!AddrReg) 275 return false; 276 277 const MachineOperand *OffsetImm = 278 getNamedOperand(LdSt, AMDGPU::OpName::offset); 279 BaseReg = AddrReg->getReg(); 280 Offset = OffsetImm->getImm(); 281 282 if (SOffset) // soffset can be an inline immediate. 
283 Offset += SOffset->getImm(); 284 285 return true; 286 } 287 288 if (isSMRD(LdSt)) { 289 const MachineOperand *OffsetImm = 290 getNamedOperand(LdSt, AMDGPU::OpName::offset); 291 if (!OffsetImm) 292 return false; 293 294 const MachineOperand *SBaseReg = 295 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 296 BaseReg = SBaseReg->getReg(); 297 Offset = OffsetImm->getImm(); 298 return true; 299 } 300 301 if (isFLAT(LdSt)) { 302 const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 303 BaseReg = AddrReg->getReg(); 304 Offset = 0; 305 return true; 306 } 307 308 return false; 309 } 310 311 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 312 MachineInstr &SecondLdSt, 313 unsigned NumLoads) const { 314 const MachineOperand *FirstDst = nullptr; 315 const MachineOperand *SecondDst = nullptr; 316 317 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 318 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 319 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 320 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 321 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 322 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 323 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 324 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 325 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 326 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 327 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 328 } 329 330 if (!FirstDst || !SecondDst) 331 return false; 332 333 // Try to limit clustering based on the total number of bytes loaded 334 // rather than the number of instructions. This is done to help reduce 335 // register pressure. The method used is somewhat inexact, though, 336 // because it assumes that all loads in the cluster will load the 337 // same number of bytes as FirstLdSt. 338 339 // The unit of this value is bytes. 340 // FIXME: This needs finer tuning. 341 unsigned LoadClusterThreshold = 16; 342 343 const MachineRegisterInfo &MRI = 344 FirstLdSt.getParent()->getParent()->getRegInfo(); 345 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 346 347 return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; 348 } 349 350 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 351 MachineBasicBlock::iterator MI, 352 const DebugLoc &DL, unsigned DestReg, 353 unsigned SrcReg, bool KillSrc) const { 354 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 355 356 if (RC == &AMDGPU::VGPR_32RegClass) { 357 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 358 AMDGPU::SReg_32RegClass.contains(SrcReg)); 359 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 360 .addReg(SrcReg, getKillRegState(KillSrc)); 361 return; 362 } 363 364 if (RC == &AMDGPU::SReg_32_XM0RegClass || 365 RC == &AMDGPU::SReg_32RegClass) { 366 if (SrcReg == AMDGPU::SCC) { 367 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 368 .addImm(-1) 369 .addImm(0); 370 return; 371 } 372 373 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 374 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 375 .addReg(SrcReg, getKillRegState(KillSrc)); 376 return; 377 } 378 379 if (RC == &AMDGPU::SReg_64RegClass) { 380 if (DestReg == AMDGPU::VCC) { 381 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 382 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 383 .addReg(SrcReg, getKillRegState(KillSrc)); 384 } else { 385 // FIXME: Hack until VReg_1 removed. 
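        // A VGPR holding an i1 value (the VReg_1 hack) is copied into VCC by
        // comparing it against zero per lane: v_cmp_ne_u32 sets the VCC bit
        // for every lane where the source register is nonzero.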
386 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 387 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 388 .addImm(0) 389 .addReg(SrcReg, getKillRegState(KillSrc)); 390 } 391 392 return; 393 } 394 395 assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); 396 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 397 .addReg(SrcReg, getKillRegState(KillSrc)); 398 return; 399 } 400 401 if (DestReg == AMDGPU::SCC) { 402 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 403 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 404 .addReg(SrcReg, getKillRegState(KillSrc)) 405 .addImm(0); 406 return; 407 } 408 409 unsigned EltSize = 4; 410 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 411 if (RI.isSGPRClass(RC)) { 412 if (RC->getSize() > 4) { 413 Opcode = AMDGPU::S_MOV_B64; 414 EltSize = 8; 415 } else { 416 Opcode = AMDGPU::S_MOV_B32; 417 EltSize = 4; 418 } 419 } 420 421 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 422 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 423 424 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 425 unsigned SubIdx; 426 if (Forward) 427 SubIdx = SubIndices[Idx]; 428 else 429 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 430 431 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 432 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 433 434 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 435 436 if (Idx == SubIndices.size() - 1) 437 Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 438 439 if (Idx == 0) 440 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 441 442 Builder.addReg(SrcReg, RegState::Implicit); 443 } 444 } 445 446 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 447 int NewOpc; 448 449 // Try to map original to commuted opcode 450 NewOpc = AMDGPU::getCommuteRev(Opcode); 451 if (NewOpc != -1) 452 // Check if the commuted (REV) opcode exists on the target. 453 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 454 455 // Try to map commuted to original opcode 456 NewOpc = AMDGPU::getCommuteOrig(Opcode); 457 if (NewOpc != -1) 458 // Check if the original (non-REV) opcode exists on the target. 459 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 460 461 return Opcode; 462 } 463 464 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 465 466 if (DstRC->getSize() == 4) { 467 return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 468 } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { 469 return AMDGPU::S_MOV_B64; 470 } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { 471 return AMDGPU::V_MOV_B64_PSEUDO; 472 } 473 return AMDGPU::COPY; 474 } 475 476 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 477 switch (Size) { 478 case 4: 479 return AMDGPU::SI_SPILL_S32_SAVE; 480 case 8: 481 return AMDGPU::SI_SPILL_S64_SAVE; 482 case 16: 483 return AMDGPU::SI_SPILL_S128_SAVE; 484 case 32: 485 return AMDGPU::SI_SPILL_S256_SAVE; 486 case 64: 487 return AMDGPU::SI_SPILL_S512_SAVE; 488 default: 489 llvm_unreachable("unknown register size"); 490 } 491 } 492 493 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 494 switch (Size) { 495 case 4: 496 return AMDGPU::SI_SPILL_V32_SAVE; 497 case 8: 498 return AMDGPU::SI_SPILL_V64_SAVE; 499 case 12: 500 return AMDGPU::SI_SPILL_V96_SAVE; 501 case 16: 502 return AMDGPU::SI_SPILL_V128_SAVE; 503 case 32: 504 return AMDGPU::SI_SPILL_V256_SAVE; 505 case 64: 506 return AMDGPU::SI_SPILL_V512_SAVE; 507 default: 508 llvm_unreachable("unknown register size"); 509 } 510 } 511 512 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 513 MachineBasicBlock::iterator MI, 514 unsigned SrcReg, bool isKill, 515 int FrameIndex, 516 const TargetRegisterClass *RC, 517 const TargetRegisterInfo *TRI) const { 518 MachineFunction *MF = MBB.getParent(); 519 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 520 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 521 DebugLoc DL = MBB.findDebugLoc(MI); 522 523 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 524 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 525 MachinePointerInfo PtrInfo 526 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 527 MachineMemOperand *MMO 528 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 529 Size, Align); 530 531 if (RI.isSGPRClass(RC)) { 532 MFI->setHasSpilledSGPRs(); 533 534 // We are only allowed to create one new instruction when spilling 535 // registers, so we need to use pseudo instruction for spilling SGPRs. 536 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); 537 538 // The SGPR spill/restore instructions only work on number sgprs, so we need 539 // to make sure we are using the correct register class. 540 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { 541 MachineRegisterInfo &MRI = MF->getRegInfo(); 542 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 543 } 544 545 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) 546 .addReg(SrcReg, getKillRegState(isKill)) // data 547 .addFrameIndex(FrameIndex) // addr 548 .addMemOperand(MMO) 549 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 550 .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); 551 // Add the scratch resource registers as implicit uses because we may end up 552 // needing them, and need to ensure that the reserved registers are 553 // correctly handled. 554 555 if (ST.hasScalarStores()) { 556 // m0 is used for offset to scalar stores if used to spill. 
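      // Marking m0 as an implicit def here tells later passes that expanding
      // this spill pseudo may clobber m0, so a live value in m0 cannot be
      // assumed to survive across the spill.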
557 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 558 } 559 560 return; 561 } 562 563 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 564 LLVMContext &Ctx = MF->getFunction()->getContext(); 565 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 566 " spill register"); 567 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 568 .addReg(SrcReg); 569 570 return; 571 } 572 573 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 574 575 unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); 576 MFI->setHasSpilledVGPRs(); 577 BuildMI(MBB, MI, DL, get(Opcode)) 578 .addReg(SrcReg, getKillRegState(isKill)) // data 579 .addFrameIndex(FrameIndex) // addr 580 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 581 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 582 .addImm(0) // offset 583 .addMemOperand(MMO); 584 } 585 586 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 587 switch (Size) { 588 case 4: 589 return AMDGPU::SI_SPILL_S32_RESTORE; 590 case 8: 591 return AMDGPU::SI_SPILL_S64_RESTORE; 592 case 16: 593 return AMDGPU::SI_SPILL_S128_RESTORE; 594 case 32: 595 return AMDGPU::SI_SPILL_S256_RESTORE; 596 case 64: 597 return AMDGPU::SI_SPILL_S512_RESTORE; 598 default: 599 llvm_unreachable("unknown register size"); 600 } 601 } 602 603 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 604 switch (Size) { 605 case 4: 606 return AMDGPU::SI_SPILL_V32_RESTORE; 607 case 8: 608 return AMDGPU::SI_SPILL_V64_RESTORE; 609 case 12: 610 return AMDGPU::SI_SPILL_V96_RESTORE; 611 case 16: 612 return AMDGPU::SI_SPILL_V128_RESTORE; 613 case 32: 614 return AMDGPU::SI_SPILL_V256_RESTORE; 615 case 64: 616 return AMDGPU::SI_SPILL_V512_RESTORE; 617 default: 618 llvm_unreachable("unknown register size"); 619 } 620 } 621 622 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 623 MachineBasicBlock::iterator MI, 624 unsigned DestReg, int FrameIndex, 625 const TargetRegisterClass *RC, 626 const TargetRegisterInfo *TRI) const { 627 MachineFunction *MF = MBB.getParent(); 628 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 629 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 630 DebugLoc DL = MBB.findDebugLoc(MI); 631 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 632 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 633 634 MachinePointerInfo PtrInfo 635 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 636 637 MachineMemOperand *MMO = MF->getMachineMemOperand( 638 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 639 640 if (RI.isSGPRClass(RC)) { 641 // FIXME: Maybe this should not include a memoperand because it will be 642 // lowered to non-memory instructions. 643 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); 644 if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { 645 MachineRegisterInfo &MRI = MF->getRegInfo(); 646 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 647 } 648 649 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 650 .addFrameIndex(FrameIndex) // addr 651 .addMemOperand(MMO) 652 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 653 .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); 654 655 if (ST.hasScalarStores()) { 656 // m0 is used for offset to scalar stores if used to spill. 
657 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 658 } 659 660 return; 661 } 662 663 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 664 LLVMContext &Ctx = MF->getFunction()->getContext(); 665 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 666 " restore register"); 667 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 668 669 return; 670 } 671 672 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 673 674 unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); 675 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 676 .addFrameIndex(FrameIndex) // vaddr 677 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 678 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 679 .addImm(0) // offset 680 .addMemOperand(MMO); 681 } 682 683 /// \param @Offset Offset in bytes of the FrameIndex being spilled 684 unsigned SIInstrInfo::calculateLDSSpillAddress( 685 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 686 unsigned FrameOffset, unsigned Size) const { 687 MachineFunction *MF = MBB.getParent(); 688 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 689 const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); 690 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 691 DebugLoc DL = MBB.findDebugLoc(MI); 692 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 693 unsigned WavefrontSize = ST.getWavefrontSize(); 694 695 unsigned TIDReg = MFI->getTIDReg(); 696 if (!MFI->hasCalculatedTID()) { 697 MachineBasicBlock &Entry = MBB.getParent()->front(); 698 MachineBasicBlock::iterator Insert = Entry.front(); 699 DebugLoc DL = Insert->getDebugLoc(); 700 701 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 702 *MF); 703 if (TIDReg == AMDGPU::NoRegister) 704 return TIDReg; 705 706 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 707 WorkGroupSize > WavefrontSize) { 708 709 unsigned TIDIGXReg 710 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); 711 unsigned TIDIGYReg 712 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 713 unsigned TIDIGZReg 714 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 715 unsigned InputPtrReg = 716 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 717 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 718 if (!Entry.isLiveIn(Reg)) 719 Entry.addLiveIn(Reg); 720 } 721 722 RS->enterBasicBlock(Entry); 723 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
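      // The sequence below folds the workgroup and thread IDs into a single
      // linear index (see the per-step comments), which is then shifted left
      // by 2 to turn it into a byte offset into the LDS spill area.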
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
829 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 830 break; 831 } 832 case AMDGPU::V_MOV_B64_PSEUDO: { 833 unsigned Dst = MI.getOperand(0).getReg(); 834 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 835 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 836 837 const MachineOperand &SrcOp = MI.getOperand(1); 838 // FIXME: Will this work for 64-bit floating point immediates? 839 assert(!SrcOp.isFPImm()); 840 if (SrcOp.isImm()) { 841 APInt Imm(64, SrcOp.getImm()); 842 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 843 .addImm(Imm.getLoBits(32).getZExtValue()) 844 .addReg(Dst, RegState::Implicit | RegState::Define); 845 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 846 .addImm(Imm.getHiBits(32).getZExtValue()) 847 .addReg(Dst, RegState::Implicit | RegState::Define); 848 } else { 849 assert(SrcOp.isReg()); 850 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 851 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 852 .addReg(Dst, RegState::Implicit | RegState::Define); 853 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 854 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 855 .addReg(Dst, RegState::Implicit | RegState::Define); 856 } 857 MI.eraseFromParent(); 858 break; 859 } 860 case AMDGPU::V_MOVRELD_B32_V1: 861 case AMDGPU::V_MOVRELD_B32_V2: 862 case AMDGPU::V_MOVRELD_B32_V4: 863 case AMDGPU::V_MOVRELD_B32_V8: 864 case AMDGPU::V_MOVRELD_B32_V16: { 865 const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); 866 unsigned VecReg = MI.getOperand(0).getReg(); 867 bool IsUndef = MI.getOperand(1).isUndef(); 868 unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); 869 assert(VecReg == MI.getOperand(1).getReg()); 870 871 MachineInstr *MovRel = 872 BuildMI(MBB, MI, DL, MovRelDesc) 873 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 874 .add(MI.getOperand(2)) 875 .addReg(VecReg, RegState::ImplicitDefine) 876 .addReg(VecReg, 877 RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 878 879 const int ImpDefIdx = 880 MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); 881 const int ImpUseIdx = ImpDefIdx + 1; 882 MovRel->tieOperands(ImpDefIdx, ImpUseIdx); 883 884 MI.eraseFromParent(); 885 break; 886 } 887 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 888 MachineFunction &MF = *MBB.getParent(); 889 unsigned Reg = MI.getOperand(0).getReg(); 890 unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 891 unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 892 893 // Create a bundle so these instructions won't be re-ordered by the 894 // post-RA scheduler. 895 MIBundleBuilder Bundler(MBB, MI); 896 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 897 898 // Add 32-bit offset from this instruction to the start of the 899 // constant data. 
900 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 901 .addReg(RegLo) 902 .add(MI.getOperand(1))); 903 904 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 905 .addReg(RegHi); 906 if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) 907 MIB.addImm(0); 908 else 909 MIB.add(MI.getOperand(2)); 910 911 Bundler.append(MIB); 912 llvm::finalizeBundle(MBB, Bundler.begin()); 913 914 MI.eraseFromParent(); 915 break; 916 } 917 } 918 return true; 919 } 920 921 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 922 MachineOperand &Src0, 923 unsigned Src0OpName, 924 MachineOperand &Src1, 925 unsigned Src1OpName) const { 926 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 927 if (!Src0Mods) 928 return false; 929 930 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 931 assert(Src1Mods && 932 "All commutable instructions have both src0 and src1 modifiers"); 933 934 int Src0ModsVal = Src0Mods->getImm(); 935 int Src1ModsVal = Src1Mods->getImm(); 936 937 Src1Mods->setImm(Src0ModsVal); 938 Src0Mods->setImm(Src1ModsVal); 939 return true; 940 } 941 942 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 943 MachineOperand &RegOp, 944 MachineOperand &NonRegOp) { 945 unsigned Reg = RegOp.getReg(); 946 unsigned SubReg = RegOp.getSubReg(); 947 bool IsKill = RegOp.isKill(); 948 bool IsDead = RegOp.isDead(); 949 bool IsUndef = RegOp.isUndef(); 950 bool IsDebug = RegOp.isDebug(); 951 952 if (NonRegOp.isImm()) 953 RegOp.ChangeToImmediate(NonRegOp.getImm()); 954 else if (NonRegOp.isFI()) 955 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 956 else 957 return nullptr; 958 959 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 960 NonRegOp.setSubReg(SubReg); 961 962 return &MI; 963 } 964 965 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 966 unsigned Src0Idx, 967 unsigned Src1Idx) const { 968 assert(!NewMI && "this should never be used"); 969 970 unsigned Opc = MI.getOpcode(); 971 int CommutedOpcode = commuteOpcode(Opc); 972 if (CommutedOpcode == -1) 973 return nullptr; 974 975 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 976 static_cast<int>(Src0Idx) && 977 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 978 static_cast<int>(Src1Idx) && 979 "inconsistency with findCommutedOpIndices"); 980 981 MachineOperand &Src0 = MI.getOperand(Src0Idx); 982 MachineOperand &Src1 = MI.getOperand(Src1Idx); 983 984 MachineInstr *CommutedMI = nullptr; 985 if (Src0.isReg() && Src1.isReg()) { 986 if (isOperandLegal(MI, Src1Idx, &Src0)) { 987 // Be sure to copy the source modifiers to the right place. 988 CommutedMI 989 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 990 } 991 992 } else if (Src0.isReg() && !Src1.isReg()) { 993 // src0 should always be able to support any operand type, so no need to 994 // check operand legality. 995 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 996 } else if (!Src0.isReg() && Src1.isReg()) { 997 if (isOperandLegal(MI, Src1Idx, &Src0)) 998 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 999 } else { 1000 // FIXME: Found two non registers to commute. This does happen. 
1001 return nullptr; 1002 } 1003 1004 1005 if (CommutedMI) { 1006 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1007 Src1, AMDGPU::OpName::src1_modifiers); 1008 1009 CommutedMI->setDesc(get(CommutedOpcode)); 1010 } 1011 1012 return CommutedMI; 1013 } 1014 1015 // This needs to be implemented because the source modifiers may be inserted 1016 // between the true commutable operands, and the base 1017 // TargetInstrInfo::commuteInstruction uses it. 1018 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, 1019 unsigned &SrcOpIdx1) const { 1020 if (!MI.isCommutable()) 1021 return false; 1022 1023 unsigned Opc = MI.getOpcode(); 1024 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1025 if (Src0Idx == -1) 1026 return false; 1027 1028 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1029 if (Src1Idx == -1) 1030 return false; 1031 1032 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1033 } 1034 1035 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 1036 int64_t BrOffset) const { 1037 // BranchRelaxation should never have to check s_setpc_b64 because its dest 1038 // block is unanalyzable. 1039 assert(BranchOp != AMDGPU::S_SETPC_B64); 1040 1041 // Convert to dwords. 1042 BrOffset /= 4; 1043 1044 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 1045 // from the next instruction. 1046 BrOffset -= 1; 1047 1048 return isIntN(BranchOffsetBits, BrOffset); 1049 } 1050 1051 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 1052 const MachineInstr &MI) const { 1053 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 1054 // This would be a difficult analysis to perform, but can always be legal so 1055 // there's no need to analyze it. 1056 return nullptr; 1057 } 1058 1059 return MI.getOperand(0).getMBB(); 1060 } 1061 1062 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1063 MachineBasicBlock &DestBB, 1064 const DebugLoc &DL, 1065 int64_t BrOffset, 1066 RegScavenger *RS) const { 1067 assert(RS && "RegScavenger required for long branching"); 1068 assert(MBB.empty() && 1069 "new block should be inserted for expanding unconditional branch"); 1070 assert(MBB.pred_size() == 1); 1071 1072 MachineFunction *MF = MBB.getParent(); 1073 MachineRegisterInfo &MRI = MF->getRegInfo(); 1074 1075 // FIXME: Virtual register workaround for RegScavenger not working with empty 1076 // blocks. 1077 unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1078 1079 auto I = MBB.end(); 1080 1081 // We need to compute the offset relative to the instruction immediately after 1082 // s_getpc_b64. Insert pc arithmetic code before last terminator. 1083 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 1084 1085 // TODO: Handle > 32-bit block address. 1086 if (BrOffset >= 0) { 1087 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 1088 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1089 .addReg(PCReg, 0, AMDGPU::sub0) 1090 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); 1091 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 1092 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1093 .addReg(PCReg, 0, AMDGPU::sub1) 1094 .addImm(0); 1095 } else { 1096 // Backwards branch. 
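    // Mirror the forward case, but subtract the distance back to DestBB from
    // the address produced by s_getpc_b64 instead of adding it.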
1097 BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) 1098 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1099 .addReg(PCReg, 0, AMDGPU::sub0) 1100 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); 1101 BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) 1102 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1103 .addReg(PCReg, 0, AMDGPU::sub1) 1104 .addImm(0); 1105 } 1106 1107 // Insert the indirect branch after the other terminator. 1108 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 1109 .addReg(PCReg); 1110 1111 // FIXME: If spilling is necessary, this will fail because this scavenger has 1112 // no emergency stack slots. It is non-trivial to spill in this situation, 1113 // because the restore code needs to be specially placed after the 1114 // jump. BranchRelaxation then needs to be made aware of the newly inserted 1115 // block. 1116 // 1117 // If a spill is needed for the pc register pair, we need to insert a spill 1118 // restore block right before the destination block, and insert a short branch 1119 // into the old destination block's fallthrough predecessor. 1120 // e.g.: 1121 // 1122 // s_cbranch_scc0 skip_long_branch: 1123 // 1124 // long_branch_bb: 1125 // spill s[8:9] 1126 // s_getpc_b64 s[8:9] 1127 // s_add_u32 s8, s8, restore_bb 1128 // s_addc_u32 s9, s9, 0 1129 // s_setpc_b64 s[8:9] 1130 // 1131 // skip_long_branch: 1132 // foo; 1133 // 1134 // ..... 1135 // 1136 // dest_bb_fallthrough_predecessor: 1137 // bar; 1138 // s_branch dest_bb 1139 // 1140 // restore_bb: 1141 // restore s[8:9] 1142 // fallthrough dest_bb 1143 /// 1144 // dest_bb: 1145 // buzz; 1146 1147 RS->enterBasicBlockEnd(MBB); 1148 unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, 1149 MachineBasicBlock::iterator(GetPC), 0); 1150 MRI.replaceRegWith(PCReg, Scav); 1151 MRI.clearVirtRegs(); 1152 RS->setRegUsed(Scav); 1153 1154 return 4 + 8 + 4 + 4; 1155 } 1156 1157 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 1158 switch (Cond) { 1159 case SIInstrInfo::SCC_TRUE: 1160 return AMDGPU::S_CBRANCH_SCC1; 1161 case SIInstrInfo::SCC_FALSE: 1162 return AMDGPU::S_CBRANCH_SCC0; 1163 case SIInstrInfo::VCCNZ: 1164 return AMDGPU::S_CBRANCH_VCCNZ; 1165 case SIInstrInfo::VCCZ: 1166 return AMDGPU::S_CBRANCH_VCCZ; 1167 case SIInstrInfo::EXECNZ: 1168 return AMDGPU::S_CBRANCH_EXECNZ; 1169 case SIInstrInfo::EXECZ: 1170 return AMDGPU::S_CBRANCH_EXECZ; 1171 default: 1172 llvm_unreachable("invalid branch predicate"); 1173 } 1174 } 1175 1176 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 1177 switch (Opcode) { 1178 case AMDGPU::S_CBRANCH_SCC0: 1179 return SCC_FALSE; 1180 case AMDGPU::S_CBRANCH_SCC1: 1181 return SCC_TRUE; 1182 case AMDGPU::S_CBRANCH_VCCNZ: 1183 return VCCNZ; 1184 case AMDGPU::S_CBRANCH_VCCZ: 1185 return VCCZ; 1186 case AMDGPU::S_CBRANCH_EXECNZ: 1187 return EXECNZ; 1188 case AMDGPU::S_CBRANCH_EXECZ: 1189 return EXECZ; 1190 default: 1191 return INVALID_BR; 1192 } 1193 } 1194 1195 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 1196 MachineBasicBlock::iterator I, 1197 MachineBasicBlock *&TBB, 1198 MachineBasicBlock *&FBB, 1199 SmallVectorImpl<MachineOperand> &Cond, 1200 bool AllowModify) const { 1201 if (I->getOpcode() == AMDGPU::S_BRANCH) { 1202 // Unconditional Branch 1203 TBB = I->getOperand(0).getMBB(); 1204 return false; 1205 } 1206 1207 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 1208 if (Pred == INVALID_BR) 1209 return true; 1210 1211 MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); 1212 
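  // Cond is encoded as: Cond[0] holds the BranchPredicate as an immediate and
  // Cond[1] holds the implicit SCC/VCC/EXEC register operand of the branch.
  // insertBranch and reverseBranchCondition rely on this layout.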
  Cond.push_back(MachineOperand::CreateImm(Pred));
  Cond.push_back(I->getOperand(1)); // Save the branch register.

  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
static void preserveCondRegFlags(MachineOperand &CondReg,
                                 const MachineOperand &OrigCond) {
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}

unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *TBB,
                                   MachineBasicBlock *FBB,
                                   ArrayRef<MachineOperand> Cond,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {

  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    if (BytesAdded)
      *BytesAdded = 4;
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    MachineInstr *CondBr =
      BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);

    // Copy the flags onto the implicit condition register operand.
1329 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1330 1331 if (BytesAdded) 1332 *BytesAdded = 4; 1333 return 1; 1334 } 1335 1336 assert(TBB && FBB); 1337 1338 MachineInstr *CondBr = 1339 BuildMI(&MBB, DL, get(Opcode)) 1340 .addMBB(TBB); 1341 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1342 .addMBB(FBB); 1343 1344 MachineOperand &CondReg = CondBr->getOperand(1); 1345 CondReg.setIsUndef(Cond[1].isUndef()); 1346 CondReg.setIsKill(Cond[1].isKill()); 1347 1348 if (BytesAdded) 1349 *BytesAdded = 8; 1350 1351 return 2; 1352 } 1353 1354 bool SIInstrInfo::reverseBranchCondition( 1355 SmallVectorImpl<MachineOperand> &Cond) const { 1356 assert(Cond.size() == 2); 1357 Cond[0].setImm(-Cond[0].getImm()); 1358 return false; 1359 } 1360 1361 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1362 ArrayRef<MachineOperand> Cond, 1363 unsigned TrueReg, unsigned FalseReg, 1364 int &CondCycles, 1365 int &TrueCycles, int &FalseCycles) const { 1366 switch (Cond[0].getImm()) { 1367 case VCCNZ: 1368 case VCCZ: { 1369 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1370 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1371 assert(MRI.getRegClass(FalseReg) == RC); 1372 1373 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1374 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1375 1376 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1377 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1378 } 1379 case SCC_TRUE: 1380 case SCC_FALSE: { 1381 // FIXME: We could insert for VGPRs if we could replace the original compare 1382 // with a vector one. 1383 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1384 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1385 assert(MRI.getRegClass(FalseReg) == RC); 1386 1387 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1388 1389 // Multiples of 8 can do s_cselect_b64 1390 if (NumInsts % 2 == 0) 1391 NumInsts /= 2; 1392 1393 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1394 return RI.isSGPRClass(RC); 1395 } 1396 default: 1397 return false; 1398 } 1399 } 1400 1401 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 1402 MachineBasicBlock::iterator I, const DebugLoc &DL, 1403 unsigned DstReg, ArrayRef<MachineOperand> Cond, 1404 unsigned TrueReg, unsigned FalseReg) const { 1405 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 1406 if (Pred == VCCZ || Pred == SCC_FALSE) { 1407 Pred = static_cast<BranchPredicate>(-Pred); 1408 std::swap(TrueReg, FalseReg); 1409 } 1410 1411 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1412 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 1413 unsigned DstSize = DstRC->getSize(); 1414 1415 if (DstSize == 4) { 1416 unsigned SelOp = Pred == SCC_TRUE ? 1417 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 1418 1419 // Instruction's operands are backwards from what is expected. 
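    // e.g. for the VALU form, v_cndmask_b32 dst, src0, src1 picks src1 for
    // lanes where the condition bit is set, so the "false" value goes first.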
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstReg)
      .addReg(FalseReg)
      .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 8 && Pred == SCC_TRUE) {
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
      .addReg(FalseReg)
      .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 4;

  // 64-bit select is only available for SALU.
  if (Pred == SCC_TRUE) {
    SelOp = AMDGPU::S_CSELECT_B64;
    EltRC = &AMDGPU::SGPR_64RegClass;
    SubIndices = Sub0_15_64;

    assert(NElts % 2 == 0);
    NElts /= 2;
  }

  MachineInstrBuilder MIB = BuildMI(
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  I = MIB->getIterator();

  SmallVector<unsigned, 8> Regs;
  for (int Idx = 0; Idx != NElts; ++Idx) {
    unsigned DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstElt)
      .addReg(FalseReg, 0, SubIdx)
      .addReg(TrueReg, 0, SubIdx);
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    switch (DefMI.getOpcode()) {
    default:
      return false;
    case AMDGPU::S_MOV_B64:
      // TODO: We could fold 64-bit immediates, but this gets complicated
      // when there are sub-registers.
      return false;

    case AMDGPU::V_MOV_B32_e32:
    case AMDGPU::S_MOV_B32:
      break;
    }
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    assert(ImmOp);
    // FIXME: We could handle FrameIndex values here.
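    // When the defining move's operand really is an immediate, the COPY is
    // rewritten in place below: it becomes a V_MOV_B32/S_MOV_B32 of that
    // immediate, and addImplicitDefUseOperands re-adds the implicit operands
    // required by the new opcode (e.g. the EXEC read for the VALU form).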
1529 if (!ImmOp->isImm()) { 1530 return false; 1531 } 1532 UseMI.setDesc(get(NewOpc)); 1533 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 1534 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 1535 return true; 1536 } 1537 1538 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 1539 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { 1540 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; 1541 1542 // Don't fold if we are using source modifiers. The new VOP2 instructions 1543 // don't have them. 1544 if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || 1545 hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) || 1546 hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) { 1547 return false; 1548 } 1549 1550 const MachineOperand &ImmOp = DefMI.getOperand(1); 1551 1552 // If this is a free constant, there's no reason to do this. 1553 // TODO: We could fold this here instead of letting SIFoldOperands do it 1554 // later. 1555 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 1556 1557 // Any src operand can be used for the legality check. 1558 if (isInlineConstant(UseMI, *Src0, ImmOp)) 1559 return false; 1560 1561 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 1562 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 1563 1564 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 1565 // We should only expect these to be on src0 due to canonicalizations. 1566 if (Src0->isReg() && Src0->getReg() == Reg) { 1567 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1568 return false; 1569 1570 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 1571 return false; 1572 1573 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 1574 1575 const int64_t Imm = DefMI.getOperand(1).getImm(); 1576 1577 // FIXME: This would be a lot easier if we could return a new instruction 1578 // instead of having to modify in place. 1579 1580 // Remove these first since they are at the end. 1581 UseMI.RemoveOperand( 1582 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1583 UseMI.RemoveOperand( 1584 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1585 1586 unsigned Src1Reg = Src1->getReg(); 1587 unsigned Src1SubReg = Src1->getSubReg(); 1588 Src0->setReg(Src1Reg); 1589 Src0->setSubReg(Src1SubReg); 1590 Src0->setIsKill(Src1->isKill()); 1591 1592 if (Opc == AMDGPU::V_MAC_F32_e64 || 1593 Opc == AMDGPU::V_MAC_F16_e64) 1594 UseMI.untieRegOperand( 1595 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1596 1597 Src1->ChangeToImmediate(Imm); 1598 1599 removeModOperands(UseMI); 1600 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); 1601 1602 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1603 if (DeleteDef) 1604 DefMI.eraseFromParent(); 1605 1606 return true; 1607 } 1608 1609 // Added part is the constant: Use v_madak_{f16, f32}. 1610 if (Src2->isReg() && Src2->getReg() == Reg) { 1611 // Not allowed to use constant bus for another operand. 1612 // We can however allow an inline immediate as src0. 
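      // The madak literal already occupies the single literal/constant-bus
      // slot a VALU instruction gets on these targets, so src0 may not be an
      // SGPR as well; an inline constant is fine because it does not count
      // against that limit.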
1613 if (!Src0->isImm() && 1614 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1615 return false; 1616 1617 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1618 return false; 1619 1620 const int64_t Imm = DefMI.getOperand(1).getImm(); 1621 1622 // FIXME: This would be a lot easier if we could return a new instruction 1623 // instead of having to modify in place. 1624 1625 // Remove these first since they are at the end. 1626 UseMI.RemoveOperand( 1627 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1628 UseMI.RemoveOperand( 1629 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1630 1631 if (Opc == AMDGPU::V_MAC_F32_e64 || 1632 Opc == AMDGPU::V_MAC_F16_e64) 1633 UseMI.untieRegOperand( 1634 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1635 1636 // ChangingToImmediate adds Src2 back to the instruction. 1637 Src2->ChangeToImmediate(Imm); 1638 1639 // These come before src2. 1640 removeModOperands(UseMI); 1641 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); 1642 1643 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1644 if (DeleteDef) 1645 DefMI.eraseFromParent(); 1646 1647 return true; 1648 } 1649 } 1650 1651 return false; 1652 } 1653 1654 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1655 int WidthB, int OffsetB) { 1656 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1657 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1658 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 1659 return LowOffset + LowWidth <= HighOffset; 1660 } 1661 1662 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, 1663 MachineInstr &MIb) const { 1664 unsigned BaseReg0, BaseReg1; 1665 int64_t Offset0, Offset1; 1666 1667 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && 1668 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { 1669 1670 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 1671 // FIXME: Handle ds_read2 / ds_write2. 1672 return false; 1673 } 1674 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 1675 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 1676 if (BaseReg0 == BaseReg1 && 1677 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1678 return true; 1679 } 1680 } 1681 1682 return false; 1683 } 1684 1685 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 1686 MachineInstr &MIb, 1687 AliasAnalysis *AA) const { 1688 assert((MIa.mayLoad() || MIa.mayStore()) && 1689 "MIa must load from or modify a memory location"); 1690 assert((MIb.mayLoad() || MIb.mayStore()) && 1691 "MIb must load from or modify a memory location"); 1692 1693 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 1694 return false; 1695 1696 // XXX - Can we relax this between address spaces? 1697 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1698 return false; 1699 1700 if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { 1701 const MachineMemOperand *MMOa = *MIa.memoperands_begin(); 1702 const MachineMemOperand *MMOb = *MIb.memoperands_begin(); 1703 if (MMOa->getValue() && MMOb->getValue()) { 1704 MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); 1705 MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); 1706 if (!AA->alias(LocA, LocB)) 1707 return true; 1708 } 1709 } 1710 1711 // TODO: Should we check the address space from the MachineMemOperand? 
  // That would allow us to distinguish objects we know don't alias based on
  // the underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {
  bool IsF16 = false;

  switch (MI.getOpcode()) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
    IsF16 = true;
  case AMDGPU::V_MAC_F32_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
    IsF16 = true;
  case AMDGPU::V_MAC_F32_e32: {
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  return BuildMI(*MBB, MI, MI.getDebugLoc(),
                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
      .add(*Dst)
      .addImm(0) // Src0 mods
      .add(*Src0)
      .addImm(0) // Src1 mods
      .add(*Src1)
      .addImm(0) // Src2 mods
      .add(*Src2)
      .addImm(0) // clamp
      .addImm(0); // omod
}

// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // XXX - Do we want the SP check in the base implementation?

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
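  // For example, without this a post-RA scheduler could move a VGPR copy
  // across an instruction that rewrites EXEC, changing which lanes the copy
  // actually writes.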
1811 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 1812 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 1813 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 1814 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 1815 changesVGPRIndexingMode(MI); 1816 } 1817 1818 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1819 switch (Imm.getBitWidth()) { 1820 case 32: 1821 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 1822 ST.hasInv2PiInlineImm()); 1823 case 64: 1824 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 1825 ST.hasInv2PiInlineImm()); 1826 case 16: 1827 return ST.has16BitInsts() && 1828 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 1829 ST.hasInv2PiInlineImm()); 1830 default: 1831 llvm_unreachable("invalid bitwidth"); 1832 } 1833 } 1834 1835 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 1836 uint8_t OperandType) const { 1837 if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) 1838 return false; 1839 1840 // MachineOperand provides no way to tell the true operand size, since it only 1841 // records a 64-bit value. We need to know the size to determine if a 32-bit 1842 // floating point immediate bit pattern is legal for an integer immediate. It 1843 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 1844 1845 int64_t Imm = MO.getImm(); 1846 switch (operandBitWidth(OperandType)) { 1847 case 32: { 1848 int32_t Trunc = static_cast<int32_t>(Imm); 1849 return Trunc == Imm && 1850 AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 1851 } 1852 case 64: { 1853 return AMDGPU::isInlinableLiteral64(MO.getImm(), 1854 ST.hasInv2PiInlineImm()); 1855 } 1856 case 16: { 1857 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 1858 // A few special case instructions have 16-bit operands on subtargets 1859 // where 16-bit instructions are not legal. 1860 // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle 1861 // constants in these cases 1862 int16_t Trunc = static_cast<int16_t>(Imm); 1863 return ST.has16BitInsts() && 1864 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 1865 } 1866 1867 return false; 1868 } 1869 default: 1870 llvm_unreachable("invalid bitwidth"); 1871 } 1872 } 1873 1874 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 1875 const MCOperandInfo &OpInfo) const { 1876 switch (MO.getType()) { 1877 case MachineOperand::MO_Register: 1878 return false; 1879 case MachineOperand::MO_Immediate: 1880 return !isInlineConstant(MO, OpInfo); 1881 case MachineOperand::MO_FrameIndex: 1882 case MachineOperand::MO_MachineBasicBlock: 1883 case MachineOperand::MO_ExternalSymbol: 1884 case MachineOperand::MO_GlobalAddress: 1885 case MachineOperand::MO_MCSymbol: 1886 return true; 1887 default: 1888 llvm_unreachable("unexpected operand type"); 1889 } 1890 } 1891 1892 static bool compareMachineOp(const MachineOperand &Op0, 1893 const MachineOperand &Op1) { 1894 if (Op0.getType() != Op1.getType()) 1895 return false; 1896 1897 switch (Op0.getType()) { 1898 case MachineOperand::MO_Register: 1899 return Op0.getReg() == Op1.getReg(); 1900 case MachineOperand::MO_Immediate: 1901 return Op0.getImm() == Op1.getImm(); 1902 default: 1903 llvm_unreachable("Didn't expect to be comparing these operand types"); 1904 } 1905 } 1906 1907 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 1908 const MachineOperand &MO) const { 1909 const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; 1910 1911 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1912 1913 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 1914 return true; 1915 1916 if (OpInfo.RegClass < 0) 1917 return false; 1918 1919 if (MO.isImm() && isInlineConstant(MO, OpInfo)) 1920 return RI.opCanUseInlineConstant(OpInfo.OperandType); 1921 1922 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 1923 } 1924 1925 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 1926 int Op32 = AMDGPU::getVOPe32(Opcode); 1927 if (Op32 == -1) 1928 return false; 1929 1930 return pseudoToMCOpcode(Op32) != -1; 1931 } 1932 1933 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 1934 // The src0_modifier operand is present on all instructions 1935 // that have modifiers. 1936 1937 return AMDGPU::getNamedOperandIdx(Opcode, 1938 AMDGPU::OpName::src0_modifiers) != -1; 1939 } 1940 1941 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 1942 unsigned OpName) const { 1943 const MachineOperand *Mods = getNamedOperand(MI, OpName); 1944 return Mods && Mods->getImm(); 1945 } 1946 1947 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 1948 const MachineOperand &MO, 1949 const MCOperandInfo &OpInfo) const { 1950 // Literal constants use the constant bus. 1951 //if (isLiteralConstantLike(MO, OpInfo)) 1952 // return true; 1953 if (MO.isImm()) 1954 return !isInlineConstant(MO, OpInfo); 1955 1956 if (!MO.isReg()) 1957 return true; // Misc other operands like FrameIndex 1958 1959 if (!MO.isUse()) 1960 return false; 1961 1962 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 1963 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 1964 1965 // FLAT_SCR is just an SGPR pair. 1966 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 1967 return true; 1968 1969 // EXEC register uses the constant bus. 
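  // Note that the implicit exec read every VALU instruction carries is not
  // counted here; only an explicit exec operand occupies the bus.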
1970 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 1971 return true; 1972 1973 // SGPRs use the constant bus 1974 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 1975 (!MO.isImplicit() && 1976 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 1977 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 1978 } 1979 1980 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 1981 for (const MachineOperand &MO : MI.implicit_operands()) { 1982 // We only care about reads. 1983 if (MO.isDef()) 1984 continue; 1985 1986 switch (MO.getReg()) { 1987 case AMDGPU::VCC: 1988 case AMDGPU::M0: 1989 case AMDGPU::FLAT_SCR: 1990 return MO.getReg(); 1991 1992 default: 1993 break; 1994 } 1995 } 1996 1997 return AMDGPU::NoRegister; 1998 } 1999 2000 static bool shouldReadExec(const MachineInstr &MI) { 2001 if (SIInstrInfo::isVALU(MI)) { 2002 switch (MI.getOpcode()) { 2003 case AMDGPU::V_READLANE_B32: 2004 case AMDGPU::V_READLANE_B32_si: 2005 case AMDGPU::V_READLANE_B32_vi: 2006 case AMDGPU::V_WRITELANE_B32: 2007 case AMDGPU::V_WRITELANE_B32_si: 2008 case AMDGPU::V_WRITELANE_B32_vi: 2009 return false; 2010 } 2011 2012 return true; 2013 } 2014 2015 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 2016 SIInstrInfo::isSALU(MI) || 2017 SIInstrInfo::isSMRD(MI)) 2018 return false; 2019 2020 return true; 2021 } 2022 2023 static bool isSubRegOf(const SIRegisterInfo &TRI, 2024 const MachineOperand &SuperVec, 2025 const MachineOperand &SubReg) { 2026 if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) 2027 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 2028 2029 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 2030 SubReg.getReg() == SuperVec.getReg(); 2031 } 2032 2033 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 2034 StringRef &ErrInfo) const { 2035 uint16_t Opcode = MI.getOpcode(); 2036 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2037 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 2038 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 2039 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 2040 2041 // Make sure the number of operands is correct. 2042 const MCInstrDesc &Desc = get(Opcode); 2043 if (!Desc.isVariadic() && 2044 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 2045 ErrInfo = "Instruction has wrong number of operands."; 2046 return false; 2047 } 2048 2049 if (MI.isInlineAsm()) { 2050 // Verify register classes for inlineasm constraints. 2051 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 2052 I != E; ++I) { 2053 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 2054 if (!RC) 2055 continue; 2056 2057 const MachineOperand &Op = MI.getOperand(I); 2058 if (!Op.isReg()) 2059 continue; 2060 2061 unsigned Reg = Op.getReg(); 2062 if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { 2063 ErrInfo = "inlineasm operand has incorrect register class."; 2064 return false; 2065 } 2066 } 2067 2068 return true; 2069 } 2070 2071 // Make sure the register classes are correct. 2072 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 2073 if (MI.getOperand(i).isFPImm()) { 2074 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 2075 "all fp values to integers."; 2076 return false; 2077 } 2078 2079 int RegClass = Desc.OpInfo[i].RegClass; 2080 2081 switch (Desc.OpInfo[i].OperandType) { 2082 case MCOI::OPERAND_REGISTER: 2083 if (MI.getOperand(i).isImm()) { 2084 ErrInfo = "Illegal immediate value for operand."; 2085 return false; 2086 } 2087 break; 2088 case AMDGPU::OPERAND_REG_IMM_INT32: 2089 case AMDGPU::OPERAND_REG_IMM_FP32: 2090 break; 2091 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2092 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 2093 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2094 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2095 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2096 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2097 const MachineOperand &MO = MI.getOperand(i); 2098 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 2099 ErrInfo = "Illegal immediate value for operand."; 2100 return false; 2101 } 2102 break; 2103 } 2104 case MCOI::OPERAND_IMMEDIATE: 2105 case AMDGPU::OPERAND_KIMM32: 2106 // Check if this operand is an immediate. 2107 // FrameIndex operands will be replaced by immediates, so they are 2108 // allowed. 2109 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 2110 ErrInfo = "Expected immediate, but got non-immediate"; 2111 return false; 2112 } 2113 LLVM_FALLTHROUGH; 2114 default: 2115 continue; 2116 } 2117 2118 if (!MI.getOperand(i).isReg()) 2119 continue; 2120 2121 if (RegClass != -1) { 2122 unsigned Reg = MI.getOperand(i).getReg(); 2123 if (Reg == AMDGPU::NoRegister || 2124 TargetRegisterInfo::isVirtualRegister(Reg)) 2125 continue; 2126 2127 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 2128 if (!RC->contains(Reg)) { 2129 ErrInfo = "Operand has incorrect register class."; 2130 return false; 2131 } 2132 } 2133 } 2134 2135 // Verify VOP* 2136 if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { 2137 // Only look at the true operands. Only a real operand can use the constant 2138 // bus, and we don't want to check pseudo-operands like the source modifier 2139 // flags. 2140 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 2141 2142 unsigned ConstantBusCount = 0; 2143 2144 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 2145 ++ConstantBusCount; 2146 2147 unsigned SGPRUsed = findImplicitSGPRRead(MI); 2148 if (SGPRUsed != AMDGPU::NoRegister) 2149 ++ConstantBusCount; 2150 2151 for (int OpIdx : OpIndices) { 2152 if (OpIdx == -1) 2153 break; 2154 const MachineOperand &MO = MI.getOperand(OpIdx); 2155 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 2156 if (MO.isReg()) { 2157 if (MO.getReg() != SGPRUsed) 2158 ++ConstantBusCount; 2159 SGPRUsed = MO.getReg(); 2160 } else { 2161 ++ConstantBusCount; 2162 } 2163 } 2164 } 2165 if (ConstantBusCount > 1) { 2166 ErrInfo = "VOP* instruction uses the constant bus more than once"; 2167 return false; 2168 } 2169 } 2170 2171 // Verify misc. restrictions on specific instructions. 
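  // v_div_scale requires src0 to be a copy of either src1 or src2; the check
  // below enforces that constraint.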
2172 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 2173 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 2174 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2175 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 2176 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 2177 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 2178 if (!compareMachineOp(Src0, Src1) && 2179 !compareMachineOp(Src0, Src2)) { 2180 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 2181 return false; 2182 } 2183 } 2184 } 2185 2186 if (isSOPK(MI)) { 2187 int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 2188 if (sopkIsZext(MI)) { 2189 if (!isUInt<16>(Imm)) { 2190 ErrInfo = "invalid immediate for SOPK instruction"; 2191 return false; 2192 } 2193 } else { 2194 if (!isInt<16>(Imm)) { 2195 ErrInfo = "invalid immediate for SOPK instruction"; 2196 return false; 2197 } 2198 } 2199 } 2200 2201 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 2202 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 2203 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2204 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 2205 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2206 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 2207 2208 const unsigned StaticNumOps = Desc.getNumOperands() + 2209 Desc.getNumImplicitUses(); 2210 const unsigned NumImplicitOps = IsDst ? 2 : 1; 2211 2212 // Allow additional implicit operands. This allows a fixup done by the post 2213 // RA scheduler where the main implicit operand is killed and implicit-defs 2214 // are added for sub-registers that remain live after this instruction. 2215 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 2216 ErrInfo = "missing implicit register operands"; 2217 return false; 2218 } 2219 2220 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2221 if (IsDst) { 2222 if (!Dst->isUse()) { 2223 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 2224 return false; 2225 } 2226 2227 unsigned UseOpIdx; 2228 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 2229 UseOpIdx != StaticNumOps + 1) { 2230 ErrInfo = "movrel implicit operands should be tied"; 2231 return false; 2232 } 2233 } 2234 2235 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2236 const MachineOperand &ImpUse 2237 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 2238 if (!ImpUse.isReg() || !ImpUse.isUse() || 2239 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 2240 ErrInfo = "src0 should be subreg of implicit vector use"; 2241 return false; 2242 } 2243 } 2244 2245 // Make sure we aren't losing exec uses in the td files. This mostly requires 2246 // being careful when using let Uses to try to add other use registers. 2247 if (shouldReadExec(MI)) { 2248 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 2249 ErrInfo = "VALU instruction does not implicitly read exec mask"; 2250 return false; 2251 } 2252 } 2253 2254 if (isSMRD(MI)) { 2255 if (MI.mayStore()) { 2256 // The register offset form of scalar stores may only use m0 as the 2257 // soffset register. 
2258 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 2259 if (Soff && Soff->getReg() != AMDGPU::M0) { 2260 ErrInfo = "scalar stores must use m0 as offset register"; 2261 return false; 2262 } 2263 } 2264 } 2265 2266 return true; 2267 } 2268 2269 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 2270 switch (MI.getOpcode()) { 2271 default: return AMDGPU::INSTRUCTION_LIST_END; 2272 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 2273 case AMDGPU::COPY: return AMDGPU::COPY; 2274 case AMDGPU::PHI: return AMDGPU::PHI; 2275 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 2276 case AMDGPU::S_MOV_B32: 2277 return MI.getOperand(1).isReg() ? 2278 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 2279 case AMDGPU::S_ADD_I32: 2280 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 2281 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 2282 case AMDGPU::S_SUB_I32: 2283 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 2284 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 2285 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 2286 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 2287 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 2288 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 2289 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 2290 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 2291 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 2292 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 2293 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 2294 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 2295 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 2296 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 2297 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 2298 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 2299 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 2300 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 2301 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 2302 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 2303 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 2304 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 2305 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 2306 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 2307 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 2308 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 2309 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 2310 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 2311 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 2312 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 2313 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 2314 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 2315 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 2316 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 2317 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 2318 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 2319 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 2320 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 2321 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 2322 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 2323 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 2324 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 2325 case AMDGPU::S_CBRANCH_SCC0: return 
AMDGPU::S_CBRANCH_VCCZ; 2326 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 2327 } 2328 } 2329 2330 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 2331 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 2332 } 2333 2334 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 2335 unsigned OpNo) const { 2336 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2337 const MCInstrDesc &Desc = get(MI.getOpcode()); 2338 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 2339 Desc.OpInfo[OpNo].RegClass == -1) { 2340 unsigned Reg = MI.getOperand(OpNo).getReg(); 2341 2342 if (TargetRegisterInfo::isVirtualRegister(Reg)) 2343 return MRI.getRegClass(Reg); 2344 return RI.getPhysRegClass(Reg); 2345 } 2346 2347 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 2348 return RI.getRegClass(RCID); 2349 } 2350 2351 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 2352 switch (MI.getOpcode()) { 2353 case AMDGPU::COPY: 2354 case AMDGPU::REG_SEQUENCE: 2355 case AMDGPU::PHI: 2356 case AMDGPU::INSERT_SUBREG: 2357 return RI.hasVGPRs(getOpRegClass(MI, 0)); 2358 default: 2359 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 2360 } 2361 } 2362 2363 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 2364 MachineBasicBlock::iterator I = MI; 2365 MachineBasicBlock *MBB = MI.getParent(); 2366 MachineOperand &MO = MI.getOperand(OpIdx); 2367 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2368 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 2369 const TargetRegisterClass *RC = RI.getRegClass(RCID); 2370 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 2371 if (MO.isReg()) 2372 Opcode = AMDGPU::COPY; 2373 else if (RI.isSGPRClass(RC)) 2374 Opcode = AMDGPU::S_MOV_B32; 2375 2376 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 2377 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 2378 VRC = &AMDGPU::VReg_64RegClass; 2379 else 2380 VRC = &AMDGPU::VGPR_32RegClass; 2381 2382 unsigned Reg = MRI.createVirtualRegister(VRC); 2383 DebugLoc DL = MBB->findDebugLoc(I); 2384 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 2385 MO.ChangeToRegister(Reg, false); 2386 } 2387 2388 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 2389 MachineRegisterInfo &MRI, 2390 MachineOperand &SuperReg, 2391 const TargetRegisterClass *SuperRC, 2392 unsigned SubIdx, 2393 const TargetRegisterClass *SubRC) 2394 const { 2395 MachineBasicBlock *MBB = MI->getParent(); 2396 DebugLoc DL = MI->getDebugLoc(); 2397 unsigned SubReg = MRI.createVirtualRegister(SubRC); 2398 2399 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 2400 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2401 .addReg(SuperReg.getReg(), 0, SubIdx); 2402 return SubReg; 2403 } 2404 2405 // Just in case the super register is itself a sub-register, copy it to a new 2406 // value so we don't need to worry about merging its subreg index with the 2407 // SubIdx passed to this function. The register coalescer should be able to 2408 // eliminate this extra copy. 
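  // For example, extracting sub0 from a use of %vreg.sub2_sub3 first copies
  // the 64-bit pair into NewSuperReg and then takes sub0 of that copy.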
2409 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 2410 2411 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 2412 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 2413 2414 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2415 .addReg(NewSuperReg, 0, SubIdx); 2416 2417 return SubReg; 2418 } 2419 2420 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 2421 MachineBasicBlock::iterator MII, 2422 MachineRegisterInfo &MRI, 2423 MachineOperand &Op, 2424 const TargetRegisterClass *SuperRC, 2425 unsigned SubIdx, 2426 const TargetRegisterClass *SubRC) const { 2427 if (Op.isImm()) { 2428 if (SubIdx == AMDGPU::sub0) 2429 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 2430 if (SubIdx == AMDGPU::sub1) 2431 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 2432 2433 llvm_unreachable("Unhandled register index for immediate"); 2434 } 2435 2436 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 2437 SubIdx, SubRC); 2438 return MachineOperand::CreateReg(SubReg, false); 2439 } 2440 2441 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 2442 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 2443 assert(Inst.getNumExplicitOperands() == 3); 2444 MachineOperand Op1 = Inst.getOperand(1); 2445 Inst.RemoveOperand(1); 2446 Inst.addOperand(Op1); 2447 } 2448 2449 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 2450 const MCOperandInfo &OpInfo, 2451 const MachineOperand &MO) const { 2452 if (!MO.isReg()) 2453 return false; 2454 2455 unsigned Reg = MO.getReg(); 2456 const TargetRegisterClass *RC = 2457 TargetRegisterInfo::isVirtualRegister(Reg) ? 2458 MRI.getRegClass(Reg) : 2459 RI.getPhysRegClass(Reg); 2460 2461 const SIRegisterInfo *TRI = 2462 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 2463 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 2464 2465 // In order to be legal, the common sub-class must be equal to the 2466 // class of the current operand. For example: 2467 // 2468 // v_mov_b32 s0 ; Operand defined as vsrc_b32 2469 // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL 2470 // 2471 // s_sendmsg 0, s0 ; Operand defined as m0reg 2472 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 2473 2474 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 2475 } 2476 2477 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 2478 const MCOperandInfo &OpInfo, 2479 const MachineOperand &MO) const { 2480 if (MO.isReg()) 2481 return isLegalRegOperand(MRI, OpInfo, MO); 2482 2483 // Handle non-register types that are treated like immediates. 2484 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2485 return true; 2486 } 2487 2488 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 2489 const MachineOperand *MO) const { 2490 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2491 const MCInstrDesc &InstDesc = MI.getDesc(); 2492 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 2493 const TargetRegisterClass *DefinedRC = 2494 OpInfo.RegClass != -1 ? 
RI.getRegClass(OpInfo.RegClass) : nullptr; 2495 if (!MO) 2496 MO = &MI.getOperand(OpIdx); 2497 2498 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 2499 2500 RegSubRegPair SGPRUsed; 2501 if (MO->isReg()) 2502 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 2503 2504 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 2505 if (i == OpIdx) 2506 continue; 2507 const MachineOperand &Op = MI.getOperand(i); 2508 if (Op.isReg()) { 2509 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 2510 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 2511 return false; 2512 } 2513 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 2514 return false; 2515 } 2516 } 2517 } 2518 2519 if (MO->isReg()) { 2520 assert(DefinedRC); 2521 return isLegalRegOperand(MRI, OpInfo, *MO); 2522 } 2523 2524 // Handle non-register types that are treated like immediates. 2525 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 2526 2527 if (!DefinedRC) { 2528 // This operand expects an immediate. 2529 return true; 2530 } 2531 2532 return isImmOperandLegal(MI, OpIdx, *MO); 2533 } 2534 2535 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 2536 MachineInstr &MI) const { 2537 unsigned Opc = MI.getOpcode(); 2538 const MCInstrDesc &InstrDesc = get(Opc); 2539 2540 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2541 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2542 2543 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 2544 // we need to only have one constant bus use. 2545 // 2546 // Note we do not need to worry about literal constants here. They are 2547 // disabled for the operand type for instructions because they will always 2548 // violate the one constant bus use rule. 2549 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 2550 if (HasImplicitSGPR) { 2551 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2552 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2553 2554 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 2555 legalizeOpWithMove(MI, Src0Idx); 2556 } 2557 2558 // VOP2 src0 instructions support all operand types, so we don't need to check 2559 // their legality. If src1 is already legal, we don't need to do anything. 2560 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 2561 return; 2562 2563 // We do not use commuteInstruction here because it is too aggressive and will 2564 // commute if it is possible. We only want to commute here if it improves 2565 // legality. This can be called a fairly large number of times so don't waste 2566 // compile time pointlessly swapping and checking legality again. 2567 if (HasImplicitSGPR || !MI.isCommutable()) { 2568 legalizeOpWithMove(MI, Src1Idx); 2569 return; 2570 } 2571 2572 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2573 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2574 2575 // If src0 can be used as src1, commuting will make the operands legal. 2576 // Otherwise we have to give up and insert a move. 2577 // 2578 // TODO: Other immediate-like operand kinds could be commuted if there was a 2579 // MachineOperand::ChangeTo* for them. 
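  // Fall back to a move if src1 is neither an immediate nor a register (e.g.
  // a frame index), or if src0 would not be a legal register operand in the
  // src1 slot after commuting.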
2580 if ((!Src1.isImm() && !Src1.isReg()) || 2581 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 2582 legalizeOpWithMove(MI, Src1Idx); 2583 return; 2584 } 2585 2586 int CommutedOpc = commuteOpcode(MI); 2587 if (CommutedOpc == -1) { 2588 legalizeOpWithMove(MI, Src1Idx); 2589 return; 2590 } 2591 2592 MI.setDesc(get(CommutedOpc)); 2593 2594 unsigned Src0Reg = Src0.getReg(); 2595 unsigned Src0SubReg = Src0.getSubReg(); 2596 bool Src0Kill = Src0.isKill(); 2597 2598 if (Src1.isImm()) 2599 Src0.ChangeToImmediate(Src1.getImm()); 2600 else if (Src1.isReg()) { 2601 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 2602 Src0.setSubReg(Src1.getSubReg()); 2603 } else 2604 llvm_unreachable("Should only have register or immediate operands"); 2605 2606 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 2607 Src1.setSubReg(Src0SubReg); 2608 } 2609 2610 // Legalize VOP3 operands. Because all operand types are supported for any 2611 // operand, and since literal constants are not allowed and should never be 2612 // seen, we only need to worry about inserting copies if we use multiple SGPR 2613 // operands. 2614 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 2615 MachineInstr &MI) const { 2616 unsigned Opc = MI.getOpcode(); 2617 2618 int VOP3Idx[3] = { 2619 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 2620 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 2621 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 2622 }; 2623 2624 // Find the one SGPR operand we are allowed to use. 2625 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 2626 2627 for (unsigned i = 0; i < 3; ++i) { 2628 int Idx = VOP3Idx[i]; 2629 if (Idx == -1) 2630 break; 2631 MachineOperand &MO = MI.getOperand(Idx); 2632 2633 // We should never see a VOP3 instruction with an illegal immediate operand. 2634 if (!MO.isReg()) 2635 continue; 2636 2637 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2638 continue; // VGPRs are legal 2639 2640 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { 2641 SGPRReg = MO.getReg(); 2642 // We can use one SGPR in each VOP3 instruction. 2643 continue; 2644 } 2645 2646 // If we make it this far, then the operand is not legal and we must 2647 // legalize it. 2648 legalizeOpWithMove(MI, Idx); 2649 } 2650 } 2651 2652 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, 2653 MachineRegisterInfo &MRI) const { 2654 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 2655 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 2656 unsigned DstReg = MRI.createVirtualRegister(SRC); 2657 unsigned SubRegs = VRC->getSize() / 4; 2658 2659 SmallVector<unsigned, 8> SRegs; 2660 for (unsigned i = 0; i < SubRegs; ++i) { 2661 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2662 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 2663 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 2664 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 2665 SRegs.push_back(SGPR); 2666 } 2667 2668 MachineInstrBuilder MIB = 2669 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 2670 get(AMDGPU::REG_SEQUENCE), DstReg); 2671 for (unsigned i = 0; i < SubRegs; ++i) { 2672 MIB.addReg(SRegs[i]); 2673 MIB.addImm(RI.getSubRegFromChannel(i)); 2674 } 2675 return DstReg; 2676 } 2677 2678 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 2679 MachineInstr &MI) const { 2680 2681 // If the pointer is store in VGPRs, then we need to move them to 2682 // SGPRs using v_readfirstlane. 
This is safe because we only select
2683   // loads with uniform pointers to SMRD instructions, so we know the
2684   // pointer value is uniform.
2685   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
2686   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2687     unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2688     SBase->setReg(SGPR);
2689   }
2690 }
2691
2692 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
2693                                          MachineBasicBlock::iterator I,
2694                                          const TargetRegisterClass *DstRC,
2695                                          MachineOperand &Op,
2696                                          MachineRegisterInfo &MRI,
2697                                          const DebugLoc &DL) const {
2698
2699   unsigned OpReg = Op.getReg();
2700   unsigned OpSubReg = Op.getSubReg();
2701
2702   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
2703       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
2704
2705   // Check if operand is already the correct register class.
2706   if (DstRC == OpRC)
2707     return;
2708
2709   unsigned DstReg = MRI.createVirtualRegister(DstRC);
2710   MachineInstr *Copy =
2711       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
2712
2713   Op.setReg(DstReg);
2714   Op.setSubReg(0);
2715
2716   MachineInstr *Def = MRI.getVRegDef(OpReg);
2717   if (!Def)
2718     return;
2719
2720   // Try to eliminate the copy if it is copying an immediate value.
2721   if (Def->isMoveImmediate())
2722     FoldImmediate(*Copy, *Def, OpReg, &MRI);
2723 }
2724
2725 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
2726   MachineFunction &MF = *MI.getParent()->getParent();
2727   MachineRegisterInfo &MRI = MF.getRegInfo();
2728
2729   // Legalize VOP2
2730   if (isVOP2(MI) || isVOPC(MI)) {
2731     legalizeOperandsVOP2(MRI, MI);
2732     return;
2733   }
2734
2735   // Legalize VOP3
2736   if (isVOP3(MI)) {
2737     legalizeOperandsVOP3(MRI, MI);
2738     return;
2739   }
2740
2741   // Legalize SMRD
2742   if (isSMRD(MI)) {
2743     legalizeOperandsSMRD(MRI, MI);
2744     return;
2745   }
2746
2747   // Legalize REG_SEQUENCE and PHI
2748   // The register class of the operands must be the same type as the register
2749   // class of the output.
2750   if (MI.getOpcode() == AMDGPU::PHI) {
2751     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2752     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
2753       if (!MI.getOperand(i).isReg() ||
2754           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
2755         continue;
2756       const TargetRegisterClass *OpRC =
2757           MRI.getRegClass(MI.getOperand(i).getReg());
2758       if (RI.hasVGPRs(OpRC)) {
2759         VRC = OpRC;
2760       } else {
2761         SRC = OpRC;
2762       }
2763     }
2764
2765     // If any of the operands are VGPR registers, then they must all be;
2766     // otherwise we will create illegal VGPR->SGPR copies when legalizing
2767     // them.
2768     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
2769       if (!VRC) {
2770         assert(SRC);
2771         VRC = RI.getEquivalentVGPRClass(SRC);
2772       }
2773       RC = VRC;
2774     } else {
2775       RC = SRC;
2776     }
2777
2778     // Update all the operands so they have the same type.
2779     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2780       MachineOperand &Op = MI.getOperand(I);
2781       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2782         continue;
2783
2784       // MI is a PHI instruction.
2785       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
2786       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2787
2788       // Avoid creating no-op copies with the same src and dst reg class. These
2789       // confuse some of the machine passes.
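      // The copy for a PHI input is emitted in the incoming block, just before
      // its terminator, rather than next to the PHI itself.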
2790 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 2791 } 2792 } 2793 2794 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 2795 // VGPR dest type and SGPR sources, insert copies so all operands are 2796 // VGPRs. This seems to help operand folding / the register coalescer. 2797 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 2798 MachineBasicBlock *MBB = MI.getParent(); 2799 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 2800 if (RI.hasVGPRs(DstRC)) { 2801 // Update all the operands so they are VGPR register classes. These may 2802 // not be the same register class because REG_SEQUENCE supports mixing 2803 // subregister index types e.g. sub0_sub1 + sub2 + sub3 2804 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2805 MachineOperand &Op = MI.getOperand(I); 2806 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) 2807 continue; 2808 2809 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 2810 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 2811 if (VRC == OpRC) 2812 continue; 2813 2814 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 2815 Op.setIsKill(); 2816 } 2817 } 2818 2819 return; 2820 } 2821 2822 // Legalize INSERT_SUBREG 2823 // src0 must have the same register class as dst 2824 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 2825 unsigned Dst = MI.getOperand(0).getReg(); 2826 unsigned Src0 = MI.getOperand(1).getReg(); 2827 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 2828 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 2829 if (DstRC != Src0RC) { 2830 MachineBasicBlock *MBB = MI.getParent(); 2831 MachineOperand &Op = MI.getOperand(1); 2832 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 2833 } 2834 return; 2835 } 2836 2837 // Legalize MIMG and MUBUF/MTBUF for shaders. 2838 // 2839 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 2840 // scratch memory access. In both cases, the legalization never involves 2841 // conversion to the addr64 form. 2842 if (isMIMG(MI) || 2843 (AMDGPU::isShader(MF.getFunction()->getCallingConv()) && 2844 (isMUBUF(MI) || isMTBUF(MI)))) { 2845 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 2846 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { 2847 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); 2848 SRsrc->setReg(SGPR); 2849 } 2850 2851 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 2852 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { 2853 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); 2854 SSamp->setReg(SGPR); 2855 } 2856 return; 2857 } 2858 2859 // Legalize MUBUF* instructions by converting to addr64 form. 2860 // FIXME: If we start using the non-addr64 instructions for compute, we 2861 // may need to legalize them as above. This especially applies to the 2862 // buffer_load_format_* variants and variants with idxen (or bothen). 2863 int SRsrcIdx = 2864 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 2865 if (SRsrcIdx != -1) { 2866 // We have an MUBUF instruction 2867 MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); 2868 unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; 2869 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), 2870 RI.getRegClass(SRsrcRC))) { 2871 // The operands are legal. 2872 // FIXME: We may need to legalize operands besided srsrc. 
2873       return;
2874     }
2875
2876     MachineBasicBlock &MBB = *MI.getParent();
2877
2878     // Extract the ptr from the resource descriptor.
2879     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2880       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2881
2882     // Create an empty resource descriptor
2883     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2884     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2885     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2886     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2887     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2888
2889     // Zero64 = 0
2890     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
2891         .addImm(0);
2892
2893     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2894     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
2895         .addImm(RsrcDataFormat & 0xFFFFFFFF);
2896
2897     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2898     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
2899         .addImm(RsrcDataFormat >> 32);
2900
2901     // NewSRsrc = {Zero64, SRsrcFormat}
2902     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2903         .addReg(Zero64)
2904         .addImm(AMDGPU::sub0_sub1)
2905         .addReg(SRsrcFormatLo)
2906         .addImm(AMDGPU::sub2)
2907         .addReg(SRsrcFormatHi)
2908         .addImm(AMDGPU::sub3);
2909
2910     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
2911     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2912     if (VAddr) {
2913       // This is already an ADDR64 instruction so we need to add the pointer
2914       // extracted from the resource descriptor to the current value of VAddr.
2915       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2916       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2917
2918       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2919       DebugLoc DL = MI.getDebugLoc();
2920       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2921           .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2922           .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2923
2924       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2925       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2926           .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2927           .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2928
2929       // NewVaddr = {NewVaddrHi, NewVaddrLo}
2930       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2931           .addReg(NewVAddrLo)
2932           .addImm(AMDGPU::sub0)
2933           .addReg(NewVAddrHi)
2934           .addImm(AMDGPU::sub1);
2935     } else {
2936       // This instruction is the _OFFSET variant, so we need to convert it to
2937       // ADDR64.
2938       assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
2939              < SISubtarget::VOLCANIC_ISLANDS &&
2940              "FIXME: Need to emit flat atomics here");
2941
2942       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
2943       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2944       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
2945       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
2946
2947       // Atomics with return have an additional tied operand and are
2948       // missing some of the special bits.
2949       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
2950       MachineInstr *Addr64;
2951
2952       if (!VDataIn) {
2953         // Regular buffer load / store.
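        // vaddr gets a placeholder register here; it is rewritten to NewVAddr
        // once the REG_SEQUENCE for the extracted pointer is emitted below.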
2954 MachineInstrBuilder MIB = 2955 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 2956 .add(*VData) 2957 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 2958 // This will be replaced later 2959 // with the new value of vaddr. 2960 .add(*SRsrc) 2961 .add(*SOffset) 2962 .add(*Offset); 2963 2964 // Atomics do not have this operand. 2965 if (const MachineOperand *GLC = 2966 getNamedOperand(MI, AMDGPU::OpName::glc)) { 2967 MIB.addImm(GLC->getImm()); 2968 } 2969 2970 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); 2971 2972 if (const MachineOperand *TFE = 2973 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 2974 MIB.addImm(TFE->getImm()); 2975 } 2976 2977 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 2978 Addr64 = MIB; 2979 } else { 2980 // Atomics with return. 2981 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 2982 .add(*VData) 2983 .add(*VDataIn) 2984 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 2985 // This will be replaced later 2986 // with the new value of vaddr. 2987 .add(*SRsrc) 2988 .add(*SOffset) 2989 .add(*Offset) 2990 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 2991 .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 2992 } 2993 2994 MI.removeFromParent(); 2995 2996 // NewVaddr = {NewVaddrHi, NewVaddrLo} 2997 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 2998 NewVAddr) 2999 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 3000 .addImm(AMDGPU::sub0) 3001 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 3002 .addImm(AMDGPU::sub1); 3003 3004 VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); 3005 SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); 3006 } 3007 3008 // Update the instruction to use NewVaddr 3009 VAddr->setReg(NewVAddr); 3010 // Update the instruction to use NewSRsrc 3011 SRsrc->setReg(NewSRsrc); 3012 } 3013 } 3014 3015 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 3016 SmallVector<MachineInstr *, 128> Worklist; 3017 Worklist.push_back(&TopInst); 3018 3019 while (!Worklist.empty()) { 3020 MachineInstr &Inst = *Worklist.pop_back_val(); 3021 MachineBasicBlock *MBB = Inst.getParent(); 3022 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 3023 3024 unsigned Opcode = Inst.getOpcode(); 3025 unsigned NewOpcode = getVALUOp(Inst); 3026 3027 // Handle some special cases 3028 switch (Opcode) { 3029 default: 3030 break; 3031 case AMDGPU::S_AND_B64: 3032 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 3033 Inst.eraseFromParent(); 3034 continue; 3035 3036 case AMDGPU::S_OR_B64: 3037 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 3038 Inst.eraseFromParent(); 3039 continue; 3040 3041 case AMDGPU::S_XOR_B64: 3042 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 3043 Inst.eraseFromParent(); 3044 continue; 3045 3046 case AMDGPU::S_NOT_B64: 3047 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 3048 Inst.eraseFromParent(); 3049 continue; 3050 3051 case AMDGPU::S_BCNT1_I32_B64: 3052 splitScalar64BitBCNT(Worklist, Inst); 3053 Inst.eraseFromParent(); 3054 continue; 3055 3056 case AMDGPU::S_BFE_I64: { 3057 splitScalar64BitBFE(Worklist, Inst); 3058 Inst.eraseFromParent(); 3059 continue; 3060 } 3061 3062 case AMDGPU::S_LSHL_B32: 3063 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3064 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 3065 swapOperands(Inst); 3066 } 3067 break; 3068 case AMDGPU::S_ASHR_I32: 3069 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3070 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 3071 
swapOperands(Inst); 3072 } 3073 break; 3074 case AMDGPU::S_LSHR_B32: 3075 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3076 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 3077 swapOperands(Inst); 3078 } 3079 break; 3080 case AMDGPU::S_LSHL_B64: 3081 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3082 NewOpcode = AMDGPU::V_LSHLREV_B64; 3083 swapOperands(Inst); 3084 } 3085 break; 3086 case AMDGPU::S_ASHR_I64: 3087 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3088 NewOpcode = AMDGPU::V_ASHRREV_I64; 3089 swapOperands(Inst); 3090 } 3091 break; 3092 case AMDGPU::S_LSHR_B64: 3093 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3094 NewOpcode = AMDGPU::V_LSHRREV_B64; 3095 swapOperands(Inst); 3096 } 3097 break; 3098 3099 case AMDGPU::S_ABS_I32: 3100 lowerScalarAbs(Worklist, Inst); 3101 Inst.eraseFromParent(); 3102 continue; 3103 3104 case AMDGPU::S_CBRANCH_SCC0: 3105 case AMDGPU::S_CBRANCH_SCC1: 3106 // Clear unused bits of vcc 3107 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 3108 AMDGPU::VCC) 3109 .addReg(AMDGPU::EXEC) 3110 .addReg(AMDGPU::VCC); 3111 break; 3112 3113 case AMDGPU::S_BFE_U64: 3114 case AMDGPU::S_BFM_B64: 3115 llvm_unreachable("Moving this op to VALU not implemented"); 3116 } 3117 3118 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 3119 // We cannot move this instruction to the VALU, so we should try to 3120 // legalize its operands instead. 3121 legalizeOperands(Inst); 3122 continue; 3123 } 3124 3125 // Use the new VALU Opcode. 3126 const MCInstrDesc &NewDesc = get(NewOpcode); 3127 Inst.setDesc(NewDesc); 3128 3129 // Remove any references to SCC. Vector instructions can't read from it, and 3130 // We're just about to add the implicit use / defs of VCC, and we don't want 3131 // both. 3132 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 3133 MachineOperand &Op = Inst.getOperand(i); 3134 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 3135 Inst.RemoveOperand(i); 3136 addSCCDefUsersToVALUWorklist(Inst, Worklist); 3137 } 3138 } 3139 3140 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 3141 // We are converting these to a BFE, so we need to add the missing 3142 // operands for the size and offset. 3143 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 3144 Inst.addOperand(MachineOperand::CreateImm(0)); 3145 Inst.addOperand(MachineOperand::CreateImm(Size)); 3146 3147 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 3148 // The VALU version adds the second operand to the result, so insert an 3149 // extra 0 operand. 3150 Inst.addOperand(MachineOperand::CreateImm(0)); 3151 } 3152 3153 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 3154 3155 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 3156 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 3157 // If we need to move this to VGPRs, we need to unpack the second operand 3158 // back into the 2 separate ones for bit offset and width. 3159 assert(OffsetWidthOp.isImm() && 3160 "Scalar BFE is only implemented for constant width and offset"); 3161 uint32_t Imm = OffsetWidthOp.getImm(); 3162 3163 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 3164 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 3165 Inst.RemoveOperand(2); // Remove old immediate. 
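      // S_BFE packs the offset in bits [5:0] and the width in bits [22:16] of
      // one immediate; the VALU V_BFE form takes them as two separate operands.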
3166 Inst.addOperand(MachineOperand::CreateImm(Offset)); 3167 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 3168 } 3169 3170 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 3171 unsigned NewDstReg = AMDGPU::NoRegister; 3172 if (HasDst) { 3173 // Update the destination register class. 3174 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 3175 if (!NewDstRC) 3176 continue; 3177 3178 unsigned DstReg = Inst.getOperand(0).getReg(); 3179 if (Inst.isCopy() && 3180 TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && 3181 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 3182 // Instead of creating a copy where src and dst are the same register 3183 // class, we just replace all uses of dst with src. These kinds of 3184 // copies interfere with the heuristics MachineSink uses to decide 3185 // whether or not to split a critical edge. Since the pass assumes 3186 // that copies will end up as machine instructions and not be 3187 // eliminated. 3188 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 3189 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 3190 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 3191 Inst.getOperand(0).setReg(DstReg); 3192 continue; 3193 } 3194 3195 NewDstReg = MRI.createVirtualRegister(NewDstRC); 3196 MRI.replaceRegWith(DstReg, NewDstReg); 3197 } 3198 3199 // Legalize the operands 3200 legalizeOperands(Inst); 3201 3202 if (HasDst) 3203 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 3204 } 3205 } 3206 3207 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 3208 MachineInstr &Inst) const { 3209 MachineBasicBlock &MBB = *Inst.getParent(); 3210 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3211 MachineBasicBlock::iterator MII = Inst; 3212 DebugLoc DL = Inst.getDebugLoc(); 3213 3214 MachineOperand &Dest = Inst.getOperand(0); 3215 MachineOperand &Src = Inst.getOperand(1); 3216 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3217 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3218 3219 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 3220 .addImm(0) 3221 .addReg(Src.getReg()); 3222 3223 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 3224 .addReg(Src.getReg()) 3225 .addReg(TmpReg); 3226 3227 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3228 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3229 } 3230 3231 void SIInstrInfo::splitScalar64BitUnaryOp( 3232 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3233 unsigned Opcode) const { 3234 MachineBasicBlock &MBB = *Inst.getParent(); 3235 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3236 3237 MachineOperand &Dest = Inst.getOperand(0); 3238 MachineOperand &Src0 = Inst.getOperand(1); 3239 DebugLoc DL = Inst.getDebugLoc(); 3240 3241 MachineBasicBlock::iterator MII = Inst; 3242 3243 const MCInstrDesc &InstDesc = get(Opcode); 3244 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
3245 MRI.getRegClass(Src0.getReg()) : 3246 &AMDGPU::SGPR_32RegClass; 3247 3248 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3249 3250 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3251 AMDGPU::sub0, Src0SubRC); 3252 3253 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3254 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3255 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3256 3257 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3258 BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 3259 3260 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3261 AMDGPU::sub1, Src0SubRC); 3262 3263 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3264 BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 3265 3266 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3267 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3268 .addReg(DestSub0) 3269 .addImm(AMDGPU::sub0) 3270 .addReg(DestSub1) 3271 .addImm(AMDGPU::sub1); 3272 3273 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3274 3275 // We don't need to legalizeOperands here because for a single operand, src0 3276 // will support any kind of input. 3277 3278 // Move all users of this moved value. 3279 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3280 } 3281 3282 void SIInstrInfo::splitScalar64BitBinaryOp( 3283 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3284 unsigned Opcode) const { 3285 MachineBasicBlock &MBB = *Inst.getParent(); 3286 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3287 3288 MachineOperand &Dest = Inst.getOperand(0); 3289 MachineOperand &Src0 = Inst.getOperand(1); 3290 MachineOperand &Src1 = Inst.getOperand(2); 3291 DebugLoc DL = Inst.getDebugLoc(); 3292 3293 MachineBasicBlock::iterator MII = Inst; 3294 3295 const MCInstrDesc &InstDesc = get(Opcode); 3296 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3297 MRI.getRegClass(Src0.getReg()) : 3298 &AMDGPU::SGPR_32RegClass; 3299 3300 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3301 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
3302 MRI.getRegClass(Src1.getReg()) : 3303 &AMDGPU::SGPR_32RegClass; 3304 3305 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 3306 3307 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3308 AMDGPU::sub0, Src0SubRC); 3309 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3310 AMDGPU::sub0, Src1SubRC); 3311 3312 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3313 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3314 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3315 3316 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3317 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 3318 .add(SrcReg0Sub0) 3319 .add(SrcReg1Sub0); 3320 3321 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3322 AMDGPU::sub1, Src0SubRC); 3323 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3324 AMDGPU::sub1, Src1SubRC); 3325 3326 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3327 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 3328 .add(SrcReg0Sub1) 3329 .add(SrcReg1Sub1); 3330 3331 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3332 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3333 .addReg(DestSub0) 3334 .addImm(AMDGPU::sub0) 3335 .addReg(DestSub1) 3336 .addImm(AMDGPU::sub1); 3337 3338 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3339 3340 // Try to legalize the operands in case we need to swap the order to keep it 3341 // valid. 3342 legalizeOperands(LoHalf); 3343 legalizeOperands(HiHalf); 3344 3345 // Move all users of this moved vlaue. 3346 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3347 } 3348 3349 void SIInstrInfo::splitScalar64BitBCNT( 3350 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { 3351 MachineBasicBlock &MBB = *Inst.getParent(); 3352 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3353 3354 MachineBasicBlock::iterator MII = Inst; 3355 DebugLoc DL = Inst.getDebugLoc(); 3356 3357 MachineOperand &Dest = Inst.getOperand(0); 3358 MachineOperand &Src = Inst.getOperand(1); 3359 3360 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 3361 const TargetRegisterClass *SrcRC = Src.isReg() ? 3362 MRI.getRegClass(Src.getReg()) : 3363 &AMDGPU::SGPR_32RegClass; 3364 3365 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3366 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3367 3368 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 3369 3370 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3371 AMDGPU::sub0, SrcSubRC); 3372 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3373 AMDGPU::sub1, SrcSubRC); 3374 3375 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 3376 3377 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 3378 3379 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3380 3381 // We don't need to legalize operands here. src0 for etiher instruction can be 3382 // an SGPR, and the second input is unused or determined here. 
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  MachineOperand &Src = Inst.getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.push_back(&UseMI);

      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI :
       llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
                        SCCDefInst.getParent()->end())) {
    // Exit if we find another SCC def.
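    // (Any later reads of SCC see that new def, not the one being converted.)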
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.push_back(&MI);
  }
}

const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.
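  //
  // For illustration: with UsedSGPRs = { s0, s1, s0 } the checks below select
  // s0; if all three SGPRs are distinct, nothing is reused and
  // AMDGPU::NoRegister is returned.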

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    RsrcDataFormat |= (1ULL << 56);

    if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
      // Set MTYPE = 2
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size

  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;

  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
            // IndexStride = 64
            (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {

  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

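/// Return the number of bytes \p MI occupies in the instruction stream.
/// For example, a plain s_mov_b32 s0, s1 encodes in 4 bytes, while
/// s_mov_b32 s0, 0x12345678 carries a trailing 32-bit literal and takes 8.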
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  //
  // FIXME: Instructions that have a base 32-bit encoding report their size as
  // 4, even though they are really 8 bytes if they have a literal operand.
  if (DescSize != 0 && DescSize != 4)
    return DescSize;

  if (Opc == AMDGPU::WAVE_BARRIER)
    return 0;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    if (isFixedSize(MI)) {
      assert(DescSize == 4);
      return DescSize;
    }

    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return 4; // No operands.

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return 8;

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return 4;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return 8;

    return 4;
  }

  if (DescSize == 4)
    return 4;

  switch (Opc) {
  case AMDGPU::SI_MASK_BRANCH:
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::EH_LABEL:
    return 0;
  case TargetOpcode::INLINEASM: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
  }
  default:
    llvm_unreachable("unable to find instruction size");
  }
}

bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}

ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
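/// It is built directly from the MachineFunction, with no scheduling DAG.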
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}