1 //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "SIInstrInfo.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "GCNHazardRecognizer.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineInstrBuilder.h" 22 #include "llvm/CodeGen/MachineRegisterInfo.h" 23 #include "llvm/CodeGen/ScheduleDAG.h" 24 #include "llvm/IR/Function.h" 25 #include "llvm/CodeGen/RegisterScavenging.h" 26 #include "llvm/MC/MCInstrDesc.h" 27 #include "llvm/Support/Debug.h" 28 29 using namespace llvm; 30 31 // Must be at least 4 to be able to branch over minimum unconditional branch 32 // code. This is only for making it possible to write reasonably small tests for 33 // long branches. 
34 static cl::opt<unsigned> 35 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), 36 cl::desc("Restrict range of branch instructions (DEBUG)")); 37 38 SIInstrInfo::SIInstrInfo(const SISubtarget &ST) 39 : AMDGPUInstrInfo(ST), RI(), ST(ST) {} 40 41 //===----------------------------------------------------------------------===// 42 // TargetInstrInfo callbacks 43 //===----------------------------------------------------------------------===// 44 45 static unsigned getNumOperandsNoGlue(SDNode *Node) { 46 unsigned N = Node->getNumOperands(); 47 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 48 --N; 49 return N; 50 } 51 52 static SDValue findChainOperand(SDNode *Load) { 53 SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); 54 assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); 55 return LastOp; 56 } 57 58 /// \brief Returns true if both nodes have the same value for the given 59 /// operand \p Op, or if both nodes do not have this operand. 60 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 61 unsigned Opc0 = N0->getMachineOpcode(); 62 unsigned Opc1 = N1->getMachineOpcode(); 63 64 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 65 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 66 67 if (Op0Idx == -1 && Op1Idx == -1) 68 return true; 69 70 71 if ((Op0Idx == -1 && Op1Idx != -1) || 72 (Op1Idx == -1 && Op0Idx != -1)) 73 return false; 74 75 // getNamedOperandIdx returns the index for the MachineInstr's operands, 76 // which includes the result as the first operand. We are indexing into the 77 // MachineSDNode's operands, so we need to skip the result operand to get 78 // the real index. 
79 --Op0Idx; 80 --Op1Idx; 81 82 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 83 } 84 85 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, 86 AliasAnalysis *AA) const { 87 // TODO: The generic check fails for VALU instructions that should be 88 // rematerializable due to implicit reads of exec. We really want all of the 89 // generic logic for this except for this. 90 switch (MI.getOpcode()) { 91 case AMDGPU::V_MOV_B32_e32: 92 case AMDGPU::V_MOV_B32_e64: 93 case AMDGPU::V_MOV_B64_PSEUDO: 94 return true; 95 default: 96 return false; 97 } 98 } 99 100 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 101 int64_t &Offset0, 102 int64_t &Offset1) const { 103 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 104 return false; 105 106 unsigned Opc0 = Load0->getMachineOpcode(); 107 unsigned Opc1 = Load1->getMachineOpcode(); 108 109 // Make sure both are actually loads. 110 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) 111 return false; 112 113 if (isDS(Opc0) && isDS(Opc1)) { 114 115 // FIXME: Handle this case: 116 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) 117 return false; 118 119 // Check base reg. 120 if (Load0->getOperand(1) != Load1->getOperand(1)) 121 return false; 122 123 // Check chain. 124 if (findChainOperand(Load0) != findChainOperand(Load1)) 125 return false; 126 127 // Skip read2 / write2 variants for simplicity. 128 // TODO: We should report true if the used offsets are adjacent (excluded 129 // st64 versions). 
130 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || 131 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) 132 return false; 133 134 Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); 135 Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); 136 return true; 137 } 138 139 if (isSMRD(Opc0) && isSMRD(Opc1)) { 140 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); 141 142 // Check base reg. 143 if (Load0->getOperand(0) != Load1->getOperand(0)) 144 return false; 145 146 const ConstantSDNode *Load0Offset = 147 dyn_cast<ConstantSDNode>(Load0->getOperand(1)); 148 const ConstantSDNode *Load1Offset = 149 dyn_cast<ConstantSDNode>(Load1->getOperand(1)); 150 151 if (!Load0Offset || !Load1Offset) 152 return false; 153 154 // Check chain. 155 if (findChainOperand(Load0) != findChainOperand(Load1)) 156 return false; 157 158 Offset0 = Load0Offset->getZExtValue(); 159 Offset1 = Load1Offset->getZExtValue(); 160 return true; 161 } 162 163 // MUBUF and MTBUF can access the same addresses. 164 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 165 166 // MUBUF and MTBUF have vaddr at different indices. 167 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 168 findChainOperand(Load0) != findChainOperand(Load1) || 169 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 170 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 171 return false; 172 173 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 174 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 175 176 if (OffIdx0 == -1 || OffIdx1 == -1) 177 return false; 178 179 // getNamedOperandIdx returns the index for MachineInstrs. Since they 180 // inlcude the output in the operand list, but SDNodes don't, we need to 181 // subtract the index by one. 
182 --OffIdx0; 183 --OffIdx1; 184 185 SDValue Off0 = Load0->getOperand(OffIdx0); 186 SDValue Off1 = Load1->getOperand(OffIdx1); 187 188 // The offset might be a FrameIndexSDNode. 189 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 190 return false; 191 192 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 193 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 194 return true; 195 } 196 197 return false; 198 } 199 200 static bool isStride64(unsigned Opc) { 201 switch (Opc) { 202 case AMDGPU::DS_READ2ST64_B32: 203 case AMDGPU::DS_READ2ST64_B64: 204 case AMDGPU::DS_WRITE2ST64_B32: 205 case AMDGPU::DS_WRITE2ST64_B64: 206 return true; 207 default: 208 return false; 209 } 210 } 211 212 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 213 int64_t &Offset, 214 const TargetRegisterInfo *TRI) const { 215 unsigned Opc = LdSt.getOpcode(); 216 217 if (isDS(LdSt)) { 218 const MachineOperand *OffsetImm = 219 getNamedOperand(LdSt, AMDGPU::OpName::offset); 220 if (OffsetImm) { 221 // Normal, single offset LDS instruction. 222 const MachineOperand *AddrReg = 223 getNamedOperand(LdSt, AMDGPU::OpName::addr); 224 225 BaseReg = AddrReg->getReg(); 226 Offset = OffsetImm->getImm(); 227 return true; 228 } 229 230 // The 2 offset instructions use offset0 and offset1 instead. We can treat 231 // these as a load with a single offset if the 2 offsets are consecutive. We 232 // will use this for some partially aligned loads. 233 const MachineOperand *Offset0Imm = 234 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 235 const MachineOperand *Offset1Imm = 236 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 237 238 uint8_t Offset0 = Offset0Imm->getImm(); 239 uint8_t Offset1 = Offset1Imm->getImm(); 240 241 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 242 // Each of these offsets is in element sized units, so we need to convert 243 // to bytes of the individual reads. 
244 245 unsigned EltSize; 246 if (LdSt.mayLoad()) 247 EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; 248 else { 249 assert(LdSt.mayStore()); 250 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 251 EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); 252 } 253 254 if (isStride64(Opc)) 255 EltSize *= 64; 256 257 const MachineOperand *AddrReg = 258 getNamedOperand(LdSt, AMDGPU::OpName::addr); 259 BaseReg = AddrReg->getReg(); 260 Offset = EltSize * Offset0; 261 return true; 262 } 263 264 return false; 265 } 266 267 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 268 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) 269 return false; 270 271 const MachineOperand *AddrReg = 272 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 273 if (!AddrReg) 274 return false; 275 276 const MachineOperand *OffsetImm = 277 getNamedOperand(LdSt, AMDGPU::OpName::offset); 278 BaseReg = AddrReg->getReg(); 279 Offset = OffsetImm->getImm(); 280 return true; 281 } 282 283 if (isSMRD(LdSt)) { 284 const MachineOperand *OffsetImm = 285 getNamedOperand(LdSt, AMDGPU::OpName::offset); 286 if (!OffsetImm) 287 return false; 288 289 const MachineOperand *SBaseReg = 290 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 291 BaseReg = SBaseReg->getReg(); 292 Offset = OffsetImm->getImm(); 293 return true; 294 } 295 296 if (isFLAT(LdSt)) { 297 const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); 298 BaseReg = AddrReg->getReg(); 299 Offset = 0; 300 return true; 301 } 302 303 return false; 304 } 305 306 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 307 MachineInstr &SecondLdSt, 308 unsigned NumLoads) const { 309 const MachineOperand *FirstDst = nullptr; 310 const MachineOperand *SecondDst = nullptr; 311 312 if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 313 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 314 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 315 } 316 317 if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 318 
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 319 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 320 } 321 322 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 323 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { 324 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 325 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 326 } 327 328 if (!FirstDst || !SecondDst) 329 return false; 330 331 // Try to limit clustering based on the total number of bytes loaded 332 // rather than the number of instructions. This is done to help reduce 333 // register pressure. The method used is somewhat inexact, though, 334 // because it assumes that all loads in the cluster will load the 335 // same number of bytes as FirstLdSt. 336 337 // The unit of this value is bytes. 338 // FIXME: This needs finer tuning. 339 unsigned LoadClusterThreshold = 16; 340 341 const MachineRegisterInfo &MRI = 342 FirstLdSt.getParent()->getParent()->getRegInfo(); 343 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 344 345 return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; 346 } 347 348 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 349 MachineBasicBlock::iterator MI, 350 const DebugLoc &DL, unsigned DestReg, 351 unsigned SrcReg, bool KillSrc) const { 352 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 353 354 if (RC == &AMDGPU::VGPR_32RegClass) { 355 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 356 AMDGPU::SReg_32RegClass.contains(SrcReg)); 357 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 358 .addReg(SrcReg, getKillRegState(KillSrc)); 359 return; 360 } 361 362 if (RC == &AMDGPU::SReg_32RegClass) { 363 if (SrcReg == AMDGPU::SCC) { 364 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 365 .addImm(-1) 366 .addImm(0); 367 return; 368 } 369 370 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 371 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 372 .addReg(SrcReg, 
getKillRegState(KillSrc)); 373 return; 374 } 375 376 if (RC == &AMDGPU::SReg_64RegClass) { 377 if (DestReg == AMDGPU::VCC) { 378 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 379 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 380 .addReg(SrcReg, getKillRegState(KillSrc)); 381 } else { 382 // FIXME: Hack until VReg_1 removed. 383 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 384 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 385 .addImm(0) 386 .addReg(SrcReg, getKillRegState(KillSrc)); 387 } 388 389 return; 390 } 391 392 assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); 393 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 394 .addReg(SrcReg, getKillRegState(KillSrc)); 395 return; 396 } 397 398 if (DestReg == AMDGPU::SCC) { 399 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 400 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 401 .addReg(SrcReg, getKillRegState(KillSrc)) 402 .addImm(0); 403 return; 404 } 405 406 unsigned EltSize = 4; 407 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 408 if (RI.isSGPRClass(RC)) { 409 if (RC->getSize() > 4) { 410 Opcode = AMDGPU::S_MOV_B64; 411 EltSize = 8; 412 } else { 413 Opcode = AMDGPU::S_MOV_B32; 414 EltSize = 4; 415 } 416 } 417 418 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 419 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 420 421 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 422 unsigned SubIdx; 423 if (Forward) 424 SubIdx = SubIndices[Idx]; 425 else 426 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 427 428 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 429 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 430 431 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 432 433 if (Idx == SubIndices.size() - 1) 434 Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 435 436 if (Idx == 0) 437 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 438 439 Builder.addReg(SrcReg, RegState::Implicit); 440 } 441 } 442 443 int 
SIInstrInfo::commuteOpcode(unsigned Opcode) const { 444 int NewOpc; 445 446 // Try to map original to commuted opcode 447 NewOpc = AMDGPU::getCommuteRev(Opcode); 448 if (NewOpc != -1) 449 // Check if the commuted (REV) opcode exists on the target. 450 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 451 452 // Try to map commuted to original opcode 453 NewOpc = AMDGPU::getCommuteOrig(Opcode); 454 if (NewOpc != -1) 455 // Check if the original (non-REV) opcode exists on the target. 456 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 457 458 return Opcode; 459 } 460 461 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 462 463 if (DstRC->getSize() == 4) { 464 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 465 } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { 466 return AMDGPU::S_MOV_B64; 467 } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { 468 return AMDGPU::V_MOV_B64_PSEUDO; 469 } 470 return AMDGPU::COPY; 471 } 472 473 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 474 switch (Size) { 475 case 4: 476 return AMDGPU::SI_SPILL_S32_SAVE; 477 case 8: 478 return AMDGPU::SI_SPILL_S64_SAVE; 479 case 16: 480 return AMDGPU::SI_SPILL_S128_SAVE; 481 case 32: 482 return AMDGPU::SI_SPILL_S256_SAVE; 483 case 64: 484 return AMDGPU::SI_SPILL_S512_SAVE; 485 default: 486 llvm_unreachable("unknown register size"); 487 } 488 } 489 490 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 491 switch (Size) { 492 case 4: 493 return AMDGPU::SI_SPILL_V32_SAVE; 494 case 8: 495 return AMDGPU::SI_SPILL_V64_SAVE; 496 case 12: 497 return AMDGPU::SI_SPILL_V96_SAVE; 498 case 16: 499 return AMDGPU::SI_SPILL_V128_SAVE; 500 case 32: 501 return AMDGPU::SI_SPILL_V256_SAVE; 502 case 64: 503 return AMDGPU::SI_SPILL_V512_SAVE; 504 default: 505 llvm_unreachable("unknown register size"); 506 } 507 } 508 509 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 510 MachineBasicBlock::iterator MI, 511 
unsigned SrcReg, bool isKill, 512 int FrameIndex, 513 const TargetRegisterClass *RC, 514 const TargetRegisterInfo *TRI) const { 515 MachineFunction *MF = MBB.getParent(); 516 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 517 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 518 DebugLoc DL = MBB.findDebugLoc(MI); 519 520 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 521 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 522 MachinePointerInfo PtrInfo 523 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 524 MachineMemOperand *MMO 525 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 526 Size, Align); 527 528 if (RI.isSGPRClass(RC)) { 529 MFI->setHasSpilledSGPRs(); 530 531 // We are only allowed to create one new instruction when spilling 532 // registers, so we need to use pseudo instruction for spilling SGPRs. 533 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); 534 535 // The SGPR spill/restore instructions only work on number sgprs, so we need 536 // to make sure we are using the correct register class. 537 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { 538 MachineRegisterInfo &MRI = MF->getRegInfo(); 539 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 540 } 541 542 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) 543 .addReg(SrcReg, getKillRegState(isKill)) // data 544 .addFrameIndex(FrameIndex) // addr 545 .addMemOperand(MMO) 546 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 547 .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); 548 // Add the scratch resource registers as implicit uses because we may end up 549 // needing them, and need to ensure that the reserved registers are 550 // correctly handled. 551 552 if (ST.hasScalarStores()) { 553 // m0 is used for offset to scalar stores if used to spill. 
554 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 555 } 556 557 return; 558 } 559 560 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 561 LLVMContext &Ctx = MF->getFunction()->getContext(); 562 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 563 " spill register"); 564 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 565 .addReg(SrcReg); 566 567 return; 568 } 569 570 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 571 572 unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); 573 MFI->setHasSpilledVGPRs(); 574 BuildMI(MBB, MI, DL, get(Opcode)) 575 .addReg(SrcReg, getKillRegState(isKill)) // data 576 .addFrameIndex(FrameIndex) // addr 577 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 578 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 579 .addImm(0) // offset 580 .addMemOperand(MMO); 581 } 582 583 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 584 switch (Size) { 585 case 4: 586 return AMDGPU::SI_SPILL_S32_RESTORE; 587 case 8: 588 return AMDGPU::SI_SPILL_S64_RESTORE; 589 case 16: 590 return AMDGPU::SI_SPILL_S128_RESTORE; 591 case 32: 592 return AMDGPU::SI_SPILL_S256_RESTORE; 593 case 64: 594 return AMDGPU::SI_SPILL_S512_RESTORE; 595 default: 596 llvm_unreachable("unknown register size"); 597 } 598 } 599 600 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 601 switch (Size) { 602 case 4: 603 return AMDGPU::SI_SPILL_V32_RESTORE; 604 case 8: 605 return AMDGPU::SI_SPILL_V64_RESTORE; 606 case 12: 607 return AMDGPU::SI_SPILL_V96_RESTORE; 608 case 16: 609 return AMDGPU::SI_SPILL_V128_RESTORE; 610 case 32: 611 return AMDGPU::SI_SPILL_V256_RESTORE; 612 case 64: 613 return AMDGPU::SI_SPILL_V512_RESTORE; 614 default: 615 llvm_unreachable("unknown register size"); 616 } 617 } 618 619 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 620 MachineBasicBlock::iterator MI, 621 unsigned DestReg, int FrameIndex, 622 const TargetRegisterClass *RC, 623 const TargetRegisterInfo *TRI) const { 624 
MachineFunction *MF = MBB.getParent(); 625 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 626 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 627 DebugLoc DL = MBB.findDebugLoc(MI); 628 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 629 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 630 631 MachinePointerInfo PtrInfo 632 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 633 634 MachineMemOperand *MMO = MF->getMachineMemOperand( 635 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 636 637 if (RI.isSGPRClass(RC)) { 638 // FIXME: Maybe this should not include a memoperand because it will be 639 // lowered to non-memory instructions. 640 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); 641 if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { 642 MachineRegisterInfo &MRI = MF->getRegInfo(); 643 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 644 } 645 646 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 647 .addFrameIndex(FrameIndex) // addr 648 .addMemOperand(MMO) 649 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 650 .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); 651 652 if (ST.hasScalarStores()) { 653 // m0 is used for offset to scalar stores if used to spill. 
654 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 655 } 656 657 return; 658 } 659 660 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 661 LLVMContext &Ctx = MF->getFunction()->getContext(); 662 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 663 " restore register"); 664 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 665 666 return; 667 } 668 669 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 670 671 unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); 672 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 673 .addFrameIndex(FrameIndex) // vaddr 674 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 675 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 676 .addImm(0) // offset 677 .addMemOperand(MMO); 678 } 679 680 /// \param @Offset Offset in bytes of the FrameIndex being spilled 681 unsigned SIInstrInfo::calculateLDSSpillAddress( 682 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 683 unsigned FrameOffset, unsigned Size) const { 684 MachineFunction *MF = MBB.getParent(); 685 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 686 const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); 687 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 688 DebugLoc DL = MBB.findDebugLoc(MI); 689 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 690 unsigned WavefrontSize = ST.getWavefrontSize(); 691 692 unsigned TIDReg = MFI->getTIDReg(); 693 if (!MFI->hasCalculatedTID()) { 694 MachineBasicBlock &Entry = MBB.getParent()->front(); 695 MachineBasicBlock::iterator Insert = Entry.front(); 696 DebugLoc DL = Insert->getDebugLoc(); 697 698 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 699 *MF); 700 if (TIDReg == AMDGPU::NoRegister) 701 return TIDReg; 702 703 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 704 WorkGroupSize > WavefrontSize) { 705 706 unsigned TIDIGXReg 707 = TRI->getPreloadedValue(*MF, 
SIRegisterInfo::WORKGROUP_ID_X); 708 unsigned TIDIGYReg 709 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 710 unsigned TIDIGZReg 711 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 712 unsigned InputPtrReg = 713 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 714 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 715 if (!Entry.isLiveIn(Reg)) 716 Entry.addLiveIn(Reg); 717 } 718 719 RS->enterBasicBlock(Entry); 720 // FIXME: Can we scavenge an SReg_64 and access the subregs? 721 unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); 722 unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); 723 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) 724 .addReg(InputPtrReg) 725 .addImm(SI::KernelInputOffsets::NGROUPS_Z); 726 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) 727 .addReg(InputPtrReg) 728 .addImm(SI::KernelInputOffsets::NGROUPS_Y); 729 730 // NGROUPS.X * NGROUPS.Y 731 BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) 732 .addReg(STmp1) 733 .addReg(STmp0); 734 // (NGROUPS.X * NGROUPS.Y) * TIDIG.X 735 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) 736 .addReg(STmp1) 737 .addReg(TIDIGXReg); 738 // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) 739 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) 740 .addReg(STmp0) 741 .addReg(TIDIGYReg) 742 .addReg(TIDReg); 743 // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z 744 BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) 745 .addReg(TIDReg) 746 .addReg(TIDIGZReg); 747 } else { 748 // Get the wave id 749 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), 750 TIDReg) 751 .addImm(-1) 752 .addImm(0); 753 754 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), 755 TIDReg) 756 .addImm(-1) 757 .addReg(TIDReg); 758 } 759 760 BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), 761 TIDReg) 762 .addImm(2) 
763 .addReg(TIDReg); 764 MFI->setTIDReg(TIDReg); 765 } 766 767 // Add FrameIndex to LDS offset 768 unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); 769 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) 770 .addImm(LDSOffset) 771 .addReg(TIDReg); 772 773 return TmpReg; 774 } 775 776 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, 777 MachineBasicBlock::iterator MI, 778 int Count) const { 779 DebugLoc DL = MBB.findDebugLoc(MI); 780 while (Count > 0) { 781 int Arg; 782 if (Count >= 8) 783 Arg = 7; 784 else 785 Arg = Count - 1; 786 Count -= 8; 787 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) 788 .addImm(Arg); 789 } 790 } 791 792 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, 793 MachineBasicBlock::iterator MI) const { 794 insertWaitStates(MBB, MI, 1); 795 } 796 797 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { 798 switch (MI.getOpcode()) { 799 default: return 1; // FIXME: Do wait states equal cycles? 800 801 case AMDGPU::S_NOP: 802 return MI.getOperand(0).getImm() + 1; 803 } 804 } 805 806 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 807 MachineBasicBlock &MBB = *MI.getParent(); 808 DebugLoc DL = MBB.findDebugLoc(MI); 809 switch (MI.getOpcode()) { 810 default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); 811 case AMDGPU::S_MOV_B64_term: { 812 // This is only a terminator to get the correct spill code placement during 813 // register allocation. 814 MI.setDesc(get(AMDGPU::S_MOV_B64)); 815 break; 816 } 817 case AMDGPU::S_XOR_B64_term: { 818 // This is only a terminator to get the correct spill code placement during 819 // register allocation. 820 MI.setDesc(get(AMDGPU::S_XOR_B64)); 821 break; 822 } 823 case AMDGPU::S_ANDN2_B64_term: { 824 // This is only a terminator to get the correct spill code placement during 825 // register allocation. 
826 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 827 break; 828 } 829 case AMDGPU::V_MOV_B64_PSEUDO: { 830 unsigned Dst = MI.getOperand(0).getReg(); 831 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 832 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 833 834 const MachineOperand &SrcOp = MI.getOperand(1); 835 // FIXME: Will this work for 64-bit floating point immediates? 836 assert(!SrcOp.isFPImm()); 837 if (SrcOp.isImm()) { 838 APInt Imm(64, SrcOp.getImm()); 839 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 840 .addImm(Imm.getLoBits(32).getZExtValue()) 841 .addReg(Dst, RegState::Implicit | RegState::Define); 842 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 843 .addImm(Imm.getHiBits(32).getZExtValue()) 844 .addReg(Dst, RegState::Implicit | RegState::Define); 845 } else { 846 assert(SrcOp.isReg()); 847 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 848 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 849 .addReg(Dst, RegState::Implicit | RegState::Define); 850 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 851 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 852 .addReg(Dst, RegState::Implicit | RegState::Define); 853 } 854 MI.eraseFromParent(); 855 break; 856 } 857 case AMDGPU::V_MOVRELD_B32_V1: 858 case AMDGPU::V_MOVRELD_B32_V2: 859 case AMDGPU::V_MOVRELD_B32_V4: 860 case AMDGPU::V_MOVRELD_B32_V8: 861 case AMDGPU::V_MOVRELD_B32_V16: { 862 const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); 863 unsigned VecReg = MI.getOperand(0).getReg(); 864 bool IsUndef = MI.getOperand(1).isUndef(); 865 unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); 866 assert(VecReg == MI.getOperand(1).getReg()); 867 868 MachineInstr *MovRel = 869 BuildMI(MBB, MI, DL, MovRelDesc) 870 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 871 .addOperand(MI.getOperand(2)) 872 .addReg(VecReg, RegState::ImplicitDefine) 873 .addReg(VecReg, RegState::Implicit | (IsUndef ? 
                     RegState::Undef : 0));

    // Tie the implicit def to the implicit use so the allocator keeps the
    // movrel source/dest in the same vector register.
    const int ImpDefIdx =
        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI.getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .addOperand(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    // No target flag on operand 2 presumably means there is no separate high
    // half; add-with-carry zero just propagates the carry from the low add.
    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
      MIB.addImm(0);
    else
      MIB.addOperand(MI.getOperand(2));

    Bundler.append(MIB);
    llvm::finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  }
  return true;
}

/// \brief Exchange the src0/src1 modifier immediates of \p MI.
///
/// Returns false (without touching \p MI) when the instruction carries no
/// modifier operands at all; asserts if only one of the two is present.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  // Swap the two immediates in place.
  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

/// \brief Commute a register operand with an immediate or frame-index operand
/// by rewriting both operands in place.
///
/// Returns &MI on success, or nullptr when the non-register operand is of an
/// unsupported kind (neither immediate nor frame index).
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  // Save all register flags before RegOp is overwritten below.
  unsigned Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  // Recreate the register operand in the other slot, preserving its flags.
  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

// Commute src0 and src1 of \p MI. Handles reg/reg, reg/imm and imm/reg
// operand pairs, swaps the source modifiers along with the operands, and
// switches to the commuted opcode. Returns nullptr if the instruction cannot
// be commuted (no commuted opcode, illegal operand placement, or two
// non-register sources).
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    // Only commute if the register moving into src1 is legal there.
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    // The modifier immediates travel with their operands.
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!MI.isCommutable())
    return false;

  unsigned Opc = MI.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  // BranchOffsetBits defaults to the hardware's 16-bit SIMM16 field but can
  // be narrowed by -amdgpu-s-branch-bits for testing (see flag above).
  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

// Expand an out-of-range unconditional branch into a
// s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 sequence that computes
// the destination address relative to the PC. Returns the byte size of the
// inserted code.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  // Replace the workaround virtual register with a scavenged physical SGPR
  // pair, scanning from the s_getpc_b64 we just inserted.
  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
                                       MachineBasicBlock::iterator(GetPC), 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Total byte size of the emitted getpc/add/addc/setpc sequence.
  return 4 + 8 + 4 + 4;
}

// Map a branch predicate to the corresponding s_cbranch opcode.
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

// Inverse of getBranchOpcode; returns INVALID_BR for non-branch opcodes.
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

// Analyze the terminators starting at \p I. Follows the analyzeBranch
// contract: returns false on success (filling TBB/FBB/Cond), true when the
// branch sequence cannot be understood.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
  if (Pred == INVALID_BR)
    return true;

  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
  Cond.push_back(MachineOperand::CreateImm(Pred));
  Cond.push_back(I->getOperand(1)); // Save the branch register.

  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
1259 if (TBB != MaskBrDest || Cond.empty()) 1260 return true; 1261 1262 auto Pred = Cond[0].getImm(); 1263 return (Pred != EXECZ && Pred != EXECNZ); 1264 } 1265 1266 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 1267 int *BytesRemoved) const { 1268 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 1269 1270 unsigned Count = 0; 1271 unsigned RemovedSize = 0; 1272 while (I != MBB.end()) { 1273 MachineBasicBlock::iterator Next = std::next(I); 1274 if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { 1275 I = Next; 1276 continue; 1277 } 1278 1279 RemovedSize += getInstSizeInBytes(*I); 1280 I->eraseFromParent(); 1281 ++Count; 1282 I = Next; 1283 } 1284 1285 if (BytesRemoved) 1286 *BytesRemoved = RemovedSize; 1287 1288 return Count; 1289 } 1290 1291 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 1292 MachineBasicBlock *TBB, 1293 MachineBasicBlock *FBB, 1294 ArrayRef<MachineOperand> Cond, 1295 const DebugLoc &DL, 1296 int *BytesAdded) const { 1297 1298 if (!FBB && Cond.empty()) { 1299 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1300 .addMBB(TBB); 1301 if (BytesAdded) 1302 *BytesAdded = 4; 1303 return 1; 1304 } 1305 1306 assert(TBB && Cond[0].isImm()); 1307 1308 unsigned Opcode 1309 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 1310 1311 if (!FBB) { 1312 Cond[1].isUndef(); 1313 MachineInstr *CondBr = 1314 BuildMI(&MBB, DL, get(Opcode)) 1315 .addMBB(TBB); 1316 1317 // Copy the flags onto the implicit condition register operand. 
1318 MachineOperand &CondReg = CondBr->getOperand(1); 1319 CondReg.setIsUndef(Cond[1].isUndef()); 1320 CondReg.setIsKill(Cond[1].isKill()); 1321 1322 if (BytesAdded) 1323 *BytesAdded = 4; 1324 return 1; 1325 } 1326 1327 assert(TBB && FBB); 1328 1329 MachineInstr *CondBr = 1330 BuildMI(&MBB, DL, get(Opcode)) 1331 .addMBB(TBB); 1332 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1333 .addMBB(FBB); 1334 1335 MachineOperand &CondReg = CondBr->getOperand(1); 1336 CondReg.setIsUndef(Cond[1].isUndef()); 1337 CondReg.setIsKill(Cond[1].isKill()); 1338 1339 if (BytesAdded) 1340 *BytesAdded = 8; 1341 1342 return 2; 1343 } 1344 1345 bool SIInstrInfo::reverseBranchCondition( 1346 SmallVectorImpl<MachineOperand> &Cond) const { 1347 assert(Cond.size() == 2); 1348 Cond[0].setImm(-Cond[0].getImm()); 1349 return false; 1350 } 1351 1352 static void removeModOperands(MachineInstr &MI) { 1353 unsigned Opc = MI.getOpcode(); 1354 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1355 AMDGPU::OpName::src0_modifiers); 1356 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1357 AMDGPU::OpName::src1_modifiers); 1358 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1359 AMDGPU::OpName::src2_modifiers); 1360 1361 MI.RemoveOperand(Src2ModIdx); 1362 MI.RemoveOperand(Src1ModIdx); 1363 MI.RemoveOperand(Src0ModIdx); 1364 } 1365 1366 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 1367 unsigned Reg, MachineRegisterInfo *MRI) const { 1368 if (!MRI->hasOneNonDBGUse(Reg)) 1369 return false; 1370 1371 unsigned Opc = UseMI.getOpcode(); 1372 if (Opc == AMDGPU::COPY) { 1373 bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); 1374 switch (DefMI.getOpcode()) { 1375 default: 1376 return false; 1377 case AMDGPU::S_MOV_B64: 1378 // TODO: We could fold 64-bit immediates, but this get compilicated 1379 // when there are sub-registers. 1380 return false; 1381 1382 case AMDGPU::V_MOV_B32_e32: 1383 case AMDGPU::S_MOV_B32: 1384 break; 1385 } 1386 unsigned NewOpc = isVGPRCopy ? 
      AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    assert(ImmOp);
    // FIXME: We could handle FrameIndex values here.
    if (!ImmOp->isImm()) {
      return false;
    }
    // Rewrite the COPY into a mov-immediate of the matching register bank.
    UseMI.setDesc(get(NewOpc));
    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    return true;
  }

  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;

    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    const MachineOperand &ImmOp = DefMI.getOperand(1);

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    if (isInlineConstant(ImmOp, 4))
      return false;

    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      // The remaining register operands must be VGPRs: madmk/madak can only
      // encode one non-VGPR (the literal) without violating the constant bus.
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      // Move src1's register into the src0 slot.
      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      // MAC forms tie src2 to the destination; untie before mutating.
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

// Returns true when the half-open ranges [OffsetA, OffsetA+WidthA) and
// [OffsetB, OffsetB+WidthB) are disjoint.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

// Returns true when both instructions have a base-register + immediate-offset
// addressing form, share the base register, and their accesses provably do
// not overlap.
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                               MachineInstr &MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {

    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
      // FIXME: Handle ds_read2 / ds_write2.
1531 return false; 1532 } 1533 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 1534 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 1535 if (BaseReg0 == BaseReg1 && 1536 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1537 return true; 1538 } 1539 } 1540 1541 return false; 1542 } 1543 1544 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 1545 MachineInstr &MIb, 1546 AliasAnalysis *AA) const { 1547 assert((MIa.mayLoad() || MIa.mayStore()) && 1548 "MIa must load from or modify a memory location"); 1549 assert((MIb.mayLoad() || MIb.mayStore()) && 1550 "MIb must load from or modify a memory location"); 1551 1552 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 1553 return false; 1554 1555 // XXX - Can we relax this between address spaces? 1556 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1557 return false; 1558 1559 if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { 1560 const MachineMemOperand *MMOa = *MIa.memoperands_begin(); 1561 const MachineMemOperand *MMOb = *MIb.memoperands_begin(); 1562 if (MMOa->getValue() && MMOb->getValue()) { 1563 MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); 1564 MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); 1565 if (!AA->alias(LocA, LocB)) 1566 return true; 1567 } 1568 } 1569 1570 // TODO: Should we check the address space from the MachineMemOperand? That 1571 // would allow us to distinguish objects we know don't alias based on the 1572 // underlying address space, even if it was lowered to a different one, 1573 // e.g. private accesses lowered to use MUBUF instructions on a scratch 1574 // buffer. 
1575 if (isDS(MIa)) { 1576 if (isDS(MIb)) 1577 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1578 1579 return !isFLAT(MIb); 1580 } 1581 1582 if (isMUBUF(MIa) || isMTBUF(MIa)) { 1583 if (isMUBUF(MIb) || isMTBUF(MIb)) 1584 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1585 1586 return !isFLAT(MIb) && !isSMRD(MIb); 1587 } 1588 1589 if (isSMRD(MIa)) { 1590 if (isSMRD(MIb)) 1591 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1592 1593 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 1594 } 1595 1596 if (isFLAT(MIa)) { 1597 if (isFLAT(MIb)) 1598 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1599 1600 return false; 1601 } 1602 1603 return false; 1604 } 1605 1606 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 1607 MachineInstr &MI, 1608 LiveVariables *LV) const { 1609 bool IsF16 = false; 1610 1611 switch (MI.getOpcode()) { 1612 default: 1613 return nullptr; 1614 case AMDGPU::V_MAC_F16_e64: 1615 IsF16 = true; 1616 case AMDGPU::V_MAC_F32_e64: 1617 break; 1618 case AMDGPU::V_MAC_F16_e32: 1619 IsF16 = true; 1620 case AMDGPU::V_MAC_F32_e32: { 1621 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 1622 if (Src0->isImm() && !isInlineConstant(*Src0, 4)) 1623 return nullptr; 1624 break; 1625 } 1626 } 1627 1628 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 1629 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 1630 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 1631 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 1632 1633 return BuildMI(*MBB, MI, MI.getDebugLoc(), 1634 get(IsF16 ? 
AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) 1635 .addOperand(*Dst) 1636 .addImm(0) // Src0 mods 1637 .addOperand(*Src0) 1638 .addImm(0) // Src1 mods 1639 .addOperand(*Src1) 1640 .addImm(0) // Src mods 1641 .addOperand(*Src2) 1642 .addImm(0) // clamp 1643 .addImm(0); // omod 1644 } 1645 1646 // It's not generally safe to move VALU instructions across these since it will 1647 // start using the register as a base index rather than directly. 1648 // XXX - Why isn't hasSideEffects sufficient for these? 1649 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 1650 switch (MI.getOpcode()) { 1651 case AMDGPU::S_SET_GPR_IDX_ON: 1652 case AMDGPU::S_SET_GPR_IDX_MODE: 1653 case AMDGPU::S_SET_GPR_IDX_OFF: 1654 return true; 1655 default: 1656 return false; 1657 } 1658 } 1659 1660 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1661 const MachineBasicBlock *MBB, 1662 const MachineFunction &MF) const { 1663 // XXX - Do we want the SP check in the base implementation? 1664 1665 // Target-independent instructions do not have an implicit-use of EXEC, even 1666 // when they operate on VGPRs. Treating EXEC modifications as scheduling 1667 // boundaries prevents incorrect movements of such instructions. 
  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         changesVGPRIndexingMode(MI);
}

// Returns true when \p Imm matches one of the hardware inline constants:
// the integers -16..64, or the bit pattern of +-0.5, +-1.0, +-2.0, +-4.0
// (and 1/(2*pi) on subtargets with hasInv2PiInlineImm) at the operand's
// bit width.
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val) ||
           (ST.hasInv2PiInlineImm() && Val == 0x3fc45f306dc9c882);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val) ||
         (ST.hasInv2PiInlineImm() && Val == 0x3e22f983);
}

// Operand-based wrapper: only immediate operands can be inline constants.
// \p OpSize is the operand size in bytes.
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.

    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

// An immediate that is not inline must be encoded as a literal.
bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

// Like isLiteralConstant, but also true for operand kinds that will be
// lowered to a literal (frame indexes, block addresses, symbols, globals).
bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
                                        unsigned OpSize) const {
  switch (MO.getType()) {
  case MachineOperand::MO_Register:
    return false;
  case MachineOperand::MO_Immediate:
    return !isInlineConstant(MO, OpSize);
  case MachineOperand::MO_FrameIndex:
  case MachineOperand::MO_MachineBasicBlock:
  case MachineOperand::MO_ExternalSymbol:
  case MachineOperand::MO_GlobalAddress:
  case MachineOperand::MO_MCSymbol:
    return true;
  default:
    llvm_unreachable("unexpected operand type");
  }
}

// Equality for the register and immediate operand kinds only.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

// Returns true when the immediate-like operand \p MO may legally be encoded
// in operand slot \p OpNo of \p MI.
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned
      OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

// Returns true when \p Opcode has a VOP2/VOP1/VOPC 32-bit encoding that can
// actually be emitted for this subtarget.
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

// Returns true when the named modifier operand exists and is non-zero.
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

// Returns true when operand \p MO consumes the single SGPR/literal constant
// bus read available to a VALU instruction.
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

// Returns the special SGPR (VCC, M0 or FLAT_SCR) that \p MI implicitly reads,
// or NoRegister if there is none.
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

// Returns true when \p MI is expected to carry an implicit use of EXEC.
// VALU instructions read EXEC, except the readlane/writelane family;
// generic, SALU and SMRD instructions do not.
static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READLANE_B32_si:
    case AMDGPU::V_READLANE_B32_vi:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::V_WRITELANE_B32_si:
    case AMDGPU::V_WRITELANE_B32_vi:
      return false;
    }

    return true;
  }

  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}

// Returns true when \p SubReg accesses a piece of the vector register
// \p SuperVec: a physical sub-register, or a virtual register equal to the
// super-vector's register with a sub-register index set.
static bool isSubRegOf(const SIRegisterInfo &TRI,
                       const MachineOperand &SuperVec,
                       const MachineOperand &SubReg) {
  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}

bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode,
AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  if (MI.isInlineAsm()) {
    // Verify register classes for inlineasm constraints.
    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
         I != E; ++I) {
      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
      if (!RC)
        continue;

      const MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg())
        continue;

      unsigned Reg = Op.getReg();
      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
        ErrInfo = "inlineasm operand has incorrect register class.";
        return false;
      }
    }

    return true;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI.getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI.getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32_INT:
    case AMDGPU::OPERAND_REG_IMM32_FP:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT:
    case AMDGPU::OPERAND_REG_INLINE_C_FP:
      // Inline-constant operands may not hold a literal that does not fit the
      // inline encoding.
      if (isLiteralConstant(MI.getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      LLVM_FALLTHROUGH;
    default:
      continue;
    }

    if (!MI.getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI.getOperand(i).getReg();
      if (Reg == AMDGPU::NoRegister ||
          TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;

    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
      ++ConstantBusCount;

    unsigned SGPRUsed = findImplicitSGPRRead(MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI.getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          // Reading the same SGPR twice only counts as one constant bus use.
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  // SOPK immediates must fit in 16 bits, zero- or sign-extended depending on
  // the opcode.
  if (isSOPK(MI)) {
    int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
    if (sopkIsZext(MI)) {
      if (!isUInt<16>(Imm)) {
        ErrInfo = "invalid immediate for SOPK instruction";
        return false;
      }
    } else {
      if (!isInt<16>(Imm)) {
        ErrInfo = "invalid immediate for SOPK instruction";
        return false;
      }
    }
  }

  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

    const unsigned StaticNumOps = Desc.getNumOperands() +
      Desc.getNumImplicitUses();
    const unsigned NumImplicitOps = IsDst ? 2 : 1;

    // Allow additional implicit operands. This allows a fixup done by the post
    // RA scheduler where the main implicit operand is killed and implicit-defs
    // are added for sub-registers that remain live after this instruction.
    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
      ErrInfo = "missing implicit register operands";
      return false;
    }

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (IsDst) {
      if (!Dst->isUse()) {
        ErrInfo = "v_movreld_b32 vdst should be a use operand";
        return false;
      }

      unsigned UseOpIdx;
      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
          UseOpIdx != StaticNumOps + 1) {
        ErrInfo = "movrel implicit operands should be tied";
        return false;
      }
    }

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &ImpUse
      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    if (!ImpUse.isReg() || !ImpUse.isUse() ||
        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
      ErrInfo = "src0 should be subreg of implicit vector use";
      return false;
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (shouldReadExec(MI)) {
    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  if (isSMRD(MI)) {
    if (MI.mayStore()) {
      // The register offset form of scalar stores may only use m0 as the
      // soffset register.
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
      if (Soff && Soff->getReg() != AMDGPU::M0) {
        ErrInfo = "scalar stores must use m0 as offset register";
        return false;
      }
    }
  }

  return true;
}

/// \brief Return the VALU opcode that performs the same operation as the
/// given scalar instruction, or AMDGPU::INSTRUCTION_LIST_END if there is no
/// vector equivalent.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case
AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

/// \brief Return true if \p MI has an equivalent VALU opcode.
bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

/// \brief Return the register class for operand \p OpNo of \p MI. Uses the
/// class from the instruction description when available; otherwise (variadic
/// instruction, out-of-range operand, or no class in the description) falls
/// back to the class of the register actually present in the operand.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI =
      MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

/// \brief Return true if operand \p OpNo of \p MI may hold a VGPR. For
/// copy-like instructions (COPY, REG_SEQUENCE, PHI, INSERT_SUBREG) the answer
/// follows the register class of the result operand instead.
bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

/// \brief Legalize operand \p OpIdx of \p MI by moving its current value into
/// a newly created VGPR virtual register (via COPY, V_MOV_B32, or S_MOV_B32
/// depending on the operand kind) and rewriting the operand to use it.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

/// \brief Copy sub-register \p SubIdx of \p SuperReg into a new virtual
/// register of class \p SubRC, inserting the copies before \p MI, and return
/// the new register.
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const
TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

/// \brief Like buildExtractSubReg, but if \p Op is a 64-bit immediate the
/// requested 32-bit half (sub0 = low, sub1 = high) is returned as an
/// immediate operand instead of emitting copies.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}

/// \brief Return true if register operand \p MO satisfies the register class
/// required by \p OpInfo.
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  unsigned Reg = MO.getReg();
  const TargetRegisterClass *RC =
    TargetRegisterInfo::isVirtualRegister(Reg) ?
    MRI.getRegClass(Reg) :
    RI.getPhysRegClass(Reg);

  const SIRegisterInfo *TRI =
    static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  RC = TRI->getSubRegClass(RC, MO.getSubReg());

  // In order to be legal, the common sub-class must be equal to the
  // class of the current operand. For example:
  //
  // v_mov_b32 s0 ; Operand defined as vsrc_b32
  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

/// \brief Like isLegalRegOperand, but immediate-like operands (imm, target
/// index, frame index) are always accepted.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

/// \brief Return true if \p MO (or, when null, the operand currently at
/// \p OpIdx) would be a legal value for operand \p OpIdx of \p MI, taking the
/// single-constant-bus-use restriction of VALU instructions into account.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
    OpInfo.RegClass != -1 ?
RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {

    // The proposed operand would use the constant bus; make sure no other
    // operand (besides a repeat of the same SGPR) already does.
    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
            usesConstantBus(MRI, Op, getOpSize(MI, i))) {
          return false;
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

/// \brief Legalize the operands of a VOP2 (or VOPC) instruction, commuting
/// src0/src1 when that alone makes the operands legal, and otherwise moving
/// the offending operand into a VGPR.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI.getOperand(Src0Idx);

    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Swap src0 and src1 in place, preserving the kill flag and subregister
  // index of each value.
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

/// \brief Copy the VGPR \p SrcReg into a new SGPR virtual register of the
/// equivalent class, using one V_READFIRSTLANE_B32 per 32-bit sub-register
/// followed by a REG_SEQUENCE, inserting before \p UseMI. Returns the new
/// SGPR register.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = VRC->getSize() / 4;

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
      .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB =
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

/// \brief Legalize all operands of \p MI, dispatching on instruction kind
/// (VOP2/VOPC, VOP3, SMRD, PHI, REG_SEQUENCE, INSERT_SUBREG, MIMG, MUBUF).
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
        MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be VGPRs,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction. Insert the copy in the predecessor block,
      // before its terminator.
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI.getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI.getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
        .addReg(Src0);
      MI.getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MIMG: the resource and sampler operands must be SGPRs.
  if (isMIMG(MI)) {
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI.getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
      .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI.getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
             < SISubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
          BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
            .addOperand(*VData)
            .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                        // This will be replaced later
                                        // with the new value of vaddr.
            .addOperand(*SRsrc)
            .addOperand(*SOffset)
            .addOperand(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
            getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
            getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addOperand(*VDataIn)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset)
          .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
          .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
      }

      MI.removeFromParent();

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
              NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

/// \brief Move \p TopInst (and, via the worklist, any instructions that must
/// follow it) from the scalar unit to the vector unit, rewriting opcodes and
/// legalizing operands as needed.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SmallVector<MachineInstr *, 128> Worklist;
  Worklist.push_back(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned NewOpcode = getVALUOp(Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
splitScalar64BitBCNT(Worklist, Inst); 2872 Inst.eraseFromParent(); 2873 continue; 2874 2875 case AMDGPU::S_BFE_I64: { 2876 splitScalar64BitBFE(Worklist, Inst); 2877 Inst.eraseFromParent(); 2878 continue; 2879 } 2880 2881 case AMDGPU::S_LSHL_B32: 2882 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2883 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 2884 swapOperands(Inst); 2885 } 2886 break; 2887 case AMDGPU::S_ASHR_I32: 2888 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2889 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 2890 swapOperands(Inst); 2891 } 2892 break; 2893 case AMDGPU::S_LSHR_B32: 2894 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2895 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 2896 swapOperands(Inst); 2897 } 2898 break; 2899 case AMDGPU::S_LSHL_B64: 2900 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2901 NewOpcode = AMDGPU::V_LSHLREV_B64; 2902 swapOperands(Inst); 2903 } 2904 break; 2905 case AMDGPU::S_ASHR_I64: 2906 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2907 NewOpcode = AMDGPU::V_ASHRREV_I64; 2908 swapOperands(Inst); 2909 } 2910 break; 2911 case AMDGPU::S_LSHR_B64: 2912 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 2913 NewOpcode = AMDGPU::V_LSHRREV_B64; 2914 swapOperands(Inst); 2915 } 2916 break; 2917 2918 case AMDGPU::S_ABS_I32: 2919 lowerScalarAbs(Worklist, Inst); 2920 Inst.eraseFromParent(); 2921 continue; 2922 2923 case AMDGPU::S_CBRANCH_SCC0: 2924 case AMDGPU::S_CBRANCH_SCC1: 2925 // Clear unused bits of vcc 2926 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 2927 AMDGPU::VCC) 2928 .addReg(AMDGPU::EXEC) 2929 .addReg(AMDGPU::VCC); 2930 break; 2931 2932 case AMDGPU::S_BFE_U64: 2933 case AMDGPU::S_BFM_B64: 2934 llvm_unreachable("Moving this op to VALU not implemented"); 2935 } 2936 2937 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 2938 // We cannot move this instruction to the VALU, so we should try to 2939 // legalize its operands instead. 
2940 legalizeOperands(Inst); 2941 continue; 2942 } 2943 2944 // Use the new VALU Opcode. 2945 const MCInstrDesc &NewDesc = get(NewOpcode); 2946 Inst.setDesc(NewDesc); 2947 2948 // Remove any references to SCC. Vector instructions can't read from it, and 2949 // We're just about to add the implicit use / defs of VCC, and we don't want 2950 // both. 2951 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 2952 MachineOperand &Op = Inst.getOperand(i); 2953 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 2954 Inst.RemoveOperand(i); 2955 addSCCDefUsersToVALUWorklist(Inst, Worklist); 2956 } 2957 } 2958 2959 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2960 // We are converting these to a BFE, so we need to add the missing 2961 // operands for the size and offset. 2962 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2963 Inst.addOperand(MachineOperand::CreateImm(0)); 2964 Inst.addOperand(MachineOperand::CreateImm(Size)); 2965 2966 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2967 // The VALU version adds the second operand to the result, so insert an 2968 // extra 0 operand. 2969 Inst.addOperand(MachineOperand::CreateImm(0)); 2970 } 2971 2972 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 2973 2974 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2975 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 2976 // If we need to move this to VGPRs, we need to unpack the second operand 2977 // back into the 2 separate ones for bit offset and width. 2978 assert(OffsetWidthOp.isImm() && 2979 "Scalar BFE is only implemented for constant width and offset"); 2980 uint32_t Imm = OffsetWidthOp.getImm(); 2981 2982 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2983 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2984 Inst.RemoveOperand(2); // Remove old immediate. 
2985 Inst.addOperand(MachineOperand::CreateImm(Offset)); 2986 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 2987 } 2988 2989 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 2990 unsigned NewDstReg = AMDGPU::NoRegister; 2991 if (HasDst) { 2992 // Update the destination register class. 2993 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 2994 if (!NewDstRC) 2995 continue; 2996 2997 unsigned DstReg = Inst.getOperand(0).getReg(); 2998 NewDstReg = MRI.createVirtualRegister(NewDstRC); 2999 MRI.replaceRegWith(DstReg, NewDstReg); 3000 } 3001 3002 // Legalize the operands 3003 legalizeOperands(Inst); 3004 3005 if (HasDst) 3006 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 3007 } 3008 } 3009 3010 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 3011 MachineInstr &Inst) const { 3012 MachineBasicBlock &MBB = *Inst.getParent(); 3013 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3014 MachineBasicBlock::iterator MII = Inst; 3015 DebugLoc DL = Inst.getDebugLoc(); 3016 3017 MachineOperand &Dest = Inst.getOperand(0); 3018 MachineOperand &Src = Inst.getOperand(1); 3019 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3020 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3021 3022 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 3023 .addImm(0) 3024 .addReg(Src.getReg()); 3025 3026 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 3027 .addReg(Src.getReg()) 3028 .addReg(TmpReg); 3029 3030 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3031 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3032 } 3033 3034 void SIInstrInfo::splitScalar64BitUnaryOp( 3035 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3036 unsigned Opcode) const { 3037 MachineBasicBlock &MBB = *Inst.getParent(); 3038 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3039 3040 MachineOperand &Dest = Inst.getOperand(0); 3041 
MachineOperand &Src0 = Inst.getOperand(1); 3042 DebugLoc DL = Inst.getDebugLoc(); 3043 3044 MachineBasicBlock::iterator MII = Inst; 3045 3046 const MCInstrDesc &InstDesc = get(Opcode); 3047 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3048 MRI.getRegClass(Src0.getReg()) : 3049 &AMDGPU::SGPR_32RegClass; 3050 3051 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3052 3053 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3054 AMDGPU::sub0, Src0SubRC); 3055 3056 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3057 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3058 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3059 3060 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3061 BuildMI(MBB, MII, DL, InstDesc, DestSub0) 3062 .addOperand(SrcReg0Sub0); 3063 3064 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3065 AMDGPU::sub1, Src0SubRC); 3066 3067 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3068 BuildMI(MBB, MII, DL, InstDesc, DestSub1) 3069 .addOperand(SrcReg0Sub1); 3070 3071 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3072 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3073 .addReg(DestSub0) 3074 .addImm(AMDGPU::sub0) 3075 .addReg(DestSub1) 3076 .addImm(AMDGPU::sub1); 3077 3078 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3079 3080 // We don't need to legalizeOperands here because for a single operand, src0 3081 // will support any kind of input. 3082 3083 // Move all users of this moved value. 
3084 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3085 } 3086 3087 void SIInstrInfo::splitScalar64BitBinaryOp( 3088 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3089 unsigned Opcode) const { 3090 MachineBasicBlock &MBB = *Inst.getParent(); 3091 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3092 3093 MachineOperand &Dest = Inst.getOperand(0); 3094 MachineOperand &Src0 = Inst.getOperand(1); 3095 MachineOperand &Src1 = Inst.getOperand(2); 3096 DebugLoc DL = Inst.getDebugLoc(); 3097 3098 MachineBasicBlock::iterator MII = Inst; 3099 3100 const MCInstrDesc &InstDesc = get(Opcode); 3101 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3102 MRI.getRegClass(Src0.getReg()) : 3103 &AMDGPU::SGPR_32RegClass; 3104 3105 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3106 const TargetRegisterClass *Src1RC = Src1.isReg() ? 3107 MRI.getRegClass(Src1.getReg()) : 3108 &AMDGPU::SGPR_32RegClass; 3109 3110 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 3111 3112 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3113 AMDGPU::sub0, Src0SubRC); 3114 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3115 AMDGPU::sub0, Src1SubRC); 3116 3117 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3118 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3119 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3120 3121 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3122 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 3123 .addOperand(SrcReg0Sub0) 3124 .addOperand(SrcReg1Sub0); 3125 3126 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3127 AMDGPU::sub1, Src0SubRC); 3128 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3129 AMDGPU::sub1, Src1SubRC); 3130 3131 unsigned DestSub1 = 
MRI.createVirtualRegister(NewDestSubRC); 3132 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 3133 .addOperand(SrcReg0Sub1) 3134 .addOperand(SrcReg1Sub1); 3135 3136 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3137 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3138 .addReg(DestSub0) 3139 .addImm(AMDGPU::sub0) 3140 .addReg(DestSub1) 3141 .addImm(AMDGPU::sub1); 3142 3143 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3144 3145 // Try to legalize the operands in case we need to swap the order to keep it 3146 // valid. 3147 legalizeOperands(LoHalf); 3148 legalizeOperands(HiHalf); 3149 3150 // Move all users of this moved vlaue. 3151 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3152 } 3153 3154 void SIInstrInfo::splitScalar64BitBCNT( 3155 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { 3156 MachineBasicBlock &MBB = *Inst.getParent(); 3157 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3158 3159 MachineBasicBlock::iterator MII = Inst; 3160 DebugLoc DL = Inst.getDebugLoc(); 3161 3162 MachineOperand &Dest = Inst.getOperand(0); 3163 MachineOperand &Src = Inst.getOperand(1); 3164 3165 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 3166 const TargetRegisterClass *SrcRC = Src.isReg() ? 
3167 MRI.getRegClass(Src.getReg()) : 3168 &AMDGPU::SGPR_32RegClass; 3169 3170 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3171 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3172 3173 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 3174 3175 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3176 AMDGPU::sub0, SrcSubRC); 3177 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3178 AMDGPU::sub1, SrcSubRC); 3179 3180 BuildMI(MBB, MII, DL, InstDesc, MidReg) 3181 .addOperand(SrcRegSub0) 3182 .addImm(0); 3183 3184 BuildMI(MBB, MII, DL, InstDesc, ResultReg) 3185 .addOperand(SrcRegSub1) 3186 .addReg(MidReg); 3187 3188 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3189 3190 // We don't need to legalize operands here. src0 for etiher instruction can be 3191 // an SGPR, and the second input is unused or determined here. 3192 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3193 } 3194 3195 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, 3196 MachineInstr &Inst) const { 3197 MachineBasicBlock &MBB = *Inst.getParent(); 3198 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3199 MachineBasicBlock::iterator MII = Inst; 3200 DebugLoc DL = Inst.getDebugLoc(); 3201 3202 MachineOperand &Dest = Inst.getOperand(0); 3203 uint32_t Imm = Inst.getOperand(2).getImm(); 3204 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 3205 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 3206 3207 (void) Offset; 3208 3209 // Only sext_inreg cases handled. 
3210 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 3211 Offset == 0 && "Not implemented"); 3212 3213 if (BitWidth < 32) { 3214 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3215 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3216 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 3217 3218 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 3219 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 3220 .addImm(0) 3221 .addImm(BitWidth); 3222 3223 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 3224 .addImm(31) 3225 .addReg(MidRegLo); 3226 3227 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 3228 .addReg(MidRegLo) 3229 .addImm(AMDGPU::sub0) 3230 .addReg(MidRegHi) 3231 .addImm(AMDGPU::sub1); 3232 3233 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3234 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3235 return; 3236 } 3237 3238 MachineOperand &Src = Inst.getOperand(1); 3239 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3240 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 3241 3242 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 3243 .addImm(31) 3244 .addReg(Src.getReg(), 0, AMDGPU::sub0); 3245 3246 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 3247 .addReg(Src.getReg(), 0, AMDGPU::sub0) 3248 .addImm(AMDGPU::sub0) 3249 .addReg(TmpReg) 3250 .addImm(AMDGPU::sub1); 3251 3252 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3253 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3254 } 3255 3256 void SIInstrInfo::addUsersToMoveToVALUWorklist( 3257 unsigned DstReg, 3258 MachineRegisterInfo &MRI, 3259 SmallVectorImpl<MachineInstr *> &Worklist) const { 3260 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 3261 E = MRI.use_end(); I != E; ++I) { 3262 MachineInstr &UseMI = *I->getParent(); 3263 if (!canReadVGPR(UseMI, 
I.getOperandNo())) { 3264 Worklist.push_back(&UseMI); 3265 } 3266 } 3267 } 3268 3269 void SIInstrInfo::addSCCDefUsersToVALUWorklist( 3270 MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { 3271 // This assumes that all the users of SCC are in the same block 3272 // as the SCC def. 3273 for (MachineInstr &MI : 3274 llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), 3275 SCCDefInst.getParent()->end())) { 3276 // Exit if we find another SCC def. 3277 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) 3278 return; 3279 3280 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) 3281 Worklist.push_back(&MI); 3282 } 3283 } 3284 3285 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 3286 const MachineInstr &Inst) const { 3287 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 3288 3289 switch (Inst.getOpcode()) { 3290 // For target instructions, getOpRegClass just returns the virtual register 3291 // class associated with the operand, so we need to find an equivalent VGPR 3292 // register class in order to move the instruction to the VALU. 3293 case AMDGPU::COPY: 3294 case AMDGPU::PHI: 3295 case AMDGPU::REG_SEQUENCE: 3296 case AMDGPU::INSERT_SUBREG: 3297 if (RI.hasVGPRs(NewDstRC)) 3298 return nullptr; 3299 3300 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 3301 if (!NewDstRC) 3302 return nullptr; 3303 return NewDstRC; 3304 default: 3305 return NewDstRC; 3306 } 3307 } 3308 3309 // Find the one SGPR operand we are allowed to use. 3310 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 3311 int OpIndices[3]) const { 3312 const MCInstrDesc &Desc = MI.getDesc(); 3313 3314 // Find the one SGPR operand we are allowed to use. 3315 // 3316 // First we need to consider the instruction's operand requirements before 3317 // legalizing. Some operands are required to be SGPRs, such as implicit uses 3318 // of VCC, but we are still bound by the constant bus requirement to only use 3319 // one. 
3320 // 3321 // If the operand's class is an SGPR, we can never move it. 3322 3323 unsigned SGPRReg = findImplicitSGPRRead(MI); 3324 if (SGPRReg != AMDGPU::NoRegister) 3325 return SGPRReg; 3326 3327 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 3328 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3329 3330 for (unsigned i = 0; i < 3; ++i) { 3331 int Idx = OpIndices[i]; 3332 if (Idx == -1) 3333 break; 3334 3335 const MachineOperand &MO = MI.getOperand(Idx); 3336 if (!MO.isReg()) 3337 continue; 3338 3339 // Is this operand statically required to be an SGPR based on the operand 3340 // constraints? 3341 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 3342 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 3343 if (IsRequiredSGPR) 3344 return MO.getReg(); 3345 3346 // If this could be a VGPR or an SGPR, Check the dynamic register class. 3347 unsigned Reg = MO.getReg(); 3348 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 3349 if (RI.isSGPRClass(RegRC)) 3350 UsedSGPRs[i] = Reg; 3351 } 3352 3353 // We don't have a required SGPR operand, so we have a bit more freedom in 3354 // selecting operands to move. 3355 3356 // Try to select the most used SGPR. If an SGPR is equal to one of the 3357 // others, we choose that. 3358 // 3359 // e.g. 3360 // V_FMA_F32 v0, s0, s0, s0 -> No moves 3361 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 3362 3363 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 3364 // prefer those. 
3365 3366 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 3367 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 3368 SGPRReg = UsedSGPRs[0]; 3369 } 3370 3371 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 3372 if (UsedSGPRs[1] == UsedSGPRs[2]) 3373 SGPRReg = UsedSGPRs[1]; 3374 } 3375 3376 return SGPRReg; 3377 } 3378 3379 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 3380 unsigned OperandName) const { 3381 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 3382 if (Idx == -1) 3383 return nullptr; 3384 3385 return &MI.getOperand(Idx); 3386 } 3387 3388 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 3389 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 3390 if (ST.isAmdHsaOS()) { 3391 RsrcDataFormat |= (1ULL << 56); 3392 3393 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 3394 // Set MTYPE = 2 3395 RsrcDataFormat |= (2ULL << 59); 3396 } 3397 3398 return RsrcDataFormat; 3399 } 3400 3401 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 3402 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 3403 AMDGPU::RSRC_TID_ENABLE | 3404 0xffffffff; // Size; 3405 3406 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 3407 3408 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | 3409 // IndexStride = 64 3410 (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); 3411 3412 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 3413 // Clear them unless we want a huge stride. 
3414 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 3415 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 3416 3417 return Rsrc23; 3418 } 3419 3420 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 3421 unsigned Opc = MI.getOpcode(); 3422 3423 return isSMRD(Opc); 3424 } 3425 3426 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { 3427 unsigned Opc = MI.getOpcode(); 3428 3429 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 3430 } 3431 3432 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 3433 int &FrameIndex) const { 3434 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 3435 if (!Addr || !Addr->isFI()) 3436 return AMDGPU::NoRegister; 3437 3438 assert(!MI.memoperands_empty() && 3439 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 3440 3441 FrameIndex = Addr->getIndex(); 3442 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 3443 } 3444 3445 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 3446 int &FrameIndex) const { 3447 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 3448 assert(Addr && Addr->isFI()); 3449 FrameIndex = Addr->getIndex(); 3450 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 3451 } 3452 3453 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 3454 int &FrameIndex) const { 3455 3456 if (!MI.mayLoad()) 3457 return AMDGPU::NoRegister; 3458 3459 if (isMUBUF(MI) || isVGPRSpill(MI)) 3460 return isStackAccess(MI, FrameIndex); 3461 3462 if (isSGPRSpill(MI)) 3463 return isSGPRStackAccess(MI, FrameIndex); 3464 3465 return AMDGPU::NoRegister; 3466 } 3467 3468 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 3469 int &FrameIndex) const { 3470 if (!MI.mayStore()) 3471 return AMDGPU::NoRegister; 3472 3473 if (isMUBUF(MI) || isVGPRSpill(MI)) 3474 return isStackAccess(MI, FrameIndex); 3475 3476 if (isSGPRSpill(MI)) 3477 return isSGPRStackAccess(MI, FrameIndex); 3478 
3479 return AMDGPU::NoRegister; 3480 } 3481 3482 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 3483 unsigned Opc = MI.getOpcode(); 3484 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 3485 unsigned DescSize = Desc.getSize(); 3486 3487 // If we have a definitive size, we can use it. Otherwise we need to inspect 3488 // the operands to know the size. 3489 // 3490 // FIXME: Instructions that have a base 32-bit encoding report their size as 3491 // 4, even though they are really 8 bytes if they have a literal operand. 3492 if (DescSize != 0 && DescSize != 4) 3493 return DescSize; 3494 3495 // 4-byte instructions may have a 32-bit literal encoded after them. Check 3496 // operands that coud ever be literals. 3497 if (isVALU(MI) || isSALU(MI)) { 3498 if (isFixedSize(MI)) { 3499 assert(DescSize == 4); 3500 return DescSize; 3501 } 3502 3503 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 3504 if (Src0Idx == -1) 3505 return 4; // No operands. 
3506 3507 if (isLiteralConstantLike(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) 3508 return 8; 3509 3510 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 3511 if (Src1Idx == -1) 3512 return 4; 3513 3514 if (isLiteralConstantLike(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) 3515 return 8; 3516 3517 return 4; 3518 } 3519 3520 if (DescSize == 4) 3521 return 4; 3522 3523 switch (Opc) { 3524 case AMDGPU::SI_MASK_BRANCH: 3525 case TargetOpcode::IMPLICIT_DEF: 3526 case TargetOpcode::KILL: 3527 case TargetOpcode::DBG_VALUE: 3528 case TargetOpcode::BUNDLE: 3529 case TargetOpcode::EH_LABEL: 3530 return 0; 3531 case TargetOpcode::INLINEASM: { 3532 const MachineFunction *MF = MI.getParent()->getParent(); 3533 const char *AsmStr = MI.getOperand(0).getSymbolName(); 3534 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); 3535 } 3536 default: 3537 llvm_unreachable("unable to find instruction size"); 3538 } 3539 } 3540 3541 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 3542 if (!isFLAT(MI)) 3543 return false; 3544 3545 if (MI.memoperands_empty()) 3546 return true; 3547 3548 for (const MachineMemOperand *MMO : MI.memoperands()) { 3549 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 3550 return true; 3551 } 3552 return false; 3553 } 3554 3555 ArrayRef<std::pair<int, const char *>> 3556 SIInstrInfo::getSerializableTargetIndices() const { 3557 static const std::pair<int, const char *> TargetIndices[] = { 3558 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 3559 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 3560 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 3561 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 3562 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 3563 return makeArrayRef(TargetIndices); 3564 } 3565 3566 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). 
The 3567 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 3568 ScheduleHazardRecognizer * 3569 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 3570 const ScheduleDAG *DAG) const { 3571 return new GCNHazardRecognizer(DAG->MF); 3572 } 3573 3574 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 3575 /// pass. 3576 ScheduleHazardRecognizer * 3577 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 3578 return new GCNHazardRecognizer(MF); 3579 } 3580